From 9599ba901487b7fff63cb8868b8524cb9c321f68 Mon Sep 17 00:00:00 2001 From: xuweichen Date: Sun, 4 Sep 2022 23:15:37 +0800 Subject: [PATCH] style(engine, deployor): revise code style --- data/data_slimmor/__init__.py | Bin 526 -> 6 bytes engine/README.md | 318 +- ...pillars_160e_kitti-3d-car.py => config.py} | 0 ...s_160e_kitti-3d-car_bk.py => config_bk.py} | 0 ...pointpillars_engine.py => engine_utils.py} | 2 +- engine/engineor.py | 10 +- .../.ipynb_checkpoints/__init__-checkpoint.py | 49 + mmdet3d/__init__.py | 98 +- mmdet3d/apis/__init__.py | 28 +- mmdet3d/apis/inference.py | 1052 ++-- mmdet3d/apis/test.py | 180 +- mmdet3d/apis/train.py | 702 +-- mmdet3d/core/__init__.py | 18 +- mmdet3d/core/anchor/__init__.py | 20 +- mmdet3d/core/anchor/anchor_3d_generator.py | 838 +-- mmdet3d/core/bbox/__init__.py | 60 +- mmdet3d/core/bbox/assigners/__init__.py | 8 +- mmdet3d/core/bbox/box_np_ops.py | 1654 +++--- mmdet3d/core/bbox/coders/__init__.py | 38 +- .../bbox/coders/anchor_free_bbox_coder.py | 260 +- .../bbox/coders/centerpoint_bbox_coders.py | 458 +- .../bbox/coders/delta_xyzwhlr_bbox_coder.py | 182 +- mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py | 254 +- .../bbox/coders/groupfree3d_bbox_coder.py | 382 +- .../core/bbox/coders/monoflex_bbox_coder.py | 1030 ++-- .../coders/partial_bin_based_bbox_coder.py | 482 +- mmdet3d/core/bbox/coders/pgd_bbox_coder.py | 256 +- .../bbox/coders/point_xyzwhlr_bbox_coder.py | 234 +- mmdet3d/core/bbox/coders/smoke_bbox_coder.py | 416 +- mmdet3d/core/bbox/iou_calculators/__init__.py | 22 +- .../bbox/iou_calculators/iou3d_calculator.py | 658 +-- mmdet3d/core/bbox/samplers/__init__.py | 26 +- .../samplers/iou_neg_piecewise_sampler.py | 366 +- mmdet3d/core/bbox/structures/__init__.py | 36 +- mmdet3d/core/bbox/structures/base_box3d.py | 1156 ++-- mmdet3d/core/bbox/structures/box_3d_mode.py | 394 +- mmdet3d/core/bbox/structures/cam_box3d.py | 708 +-- mmdet3d/core/bbox/structures/coord_3d_mode.py | 468 +- mmdet3d/core/bbox/structures/depth_box3d.py | 540 +- mmdet3d/core/bbox/structures/lidar_box3d.py | 420 +- mmdet3d/core/bbox/structures/utils.py | 670 +-- mmdet3d/core/bbox/transforms.py | 152 +- mmdet3d/core/evaluation/__init__.py | 22 +- mmdet3d/core/evaluation/indoor_eval.py | 618 +- mmdet3d/core/evaluation/instance_seg_eval.py | 256 +- .../core/evaluation/kitti_utils/__init__.py | 8 +- mmdet3d/core/evaluation/kitti_utils/eval.py | 1900 +++---- .../core/evaluation/kitti_utils/rotate_iou.py | 758 +-- mmdet3d/core/evaluation/lyft_eval.py | 570 +- .../core/evaluation/scannet_utils/__init__.py | 8 +- .../evaluate_semantic_instance.py | 694 +-- .../core/evaluation/scannet_utils/util_3d.py | 168 +- mmdet3d/core/evaluation/seg_eval.py | 262 +- .../core/evaluation/waymo_utils/__init__.py | 8 +- .../waymo_utils/prediction_kitti_to_waymo.py | 526 +- mmdet3d/core/points/__init__.py | 60 +- mmdet3d/core/points/base_points.py | 880 +-- mmdet3d/core/points/cam_points.py | 126 +- mmdet3d/core/points/depth_points.py | 116 +- mmdet3d/core/points/lidar_points.py | 116 +- mmdet3d/core/post_processing/__init__.py | 28 +- mmdet3d/core/post_processing/box3d_nms.py | 576 +- mmdet3d/core/post_processing/merge_augs.py | 184 +- mmdet3d/core/utils/__init__.py | 20 +- mmdet3d/core/utils/array_converter.py | 648 +-- mmdet3d/core/utils/gaussian.py | 316 +- mmdet3d/core/visualizer/__init__.py | 10 +- mmdet3d/core/visualizer/image_vis.py | 412 +- mmdet3d/core/visualizer/open3d_vis.py | 920 +-- mmdet3d/core/visualizer/show_result.py | 582 +- mmdet3d/core/voxel/__init__.py | 10 +- 
mmdet3d/core/voxel/builder.py | 32 +- mmdet3d/core/voxel/voxel_generator.py | 560 +- mmdet3d/datasets/__init__.py | 94 +- mmdet3d/datasets/builder.py | 94 +- mmdet3d/datasets/custom_3d.py | 896 +-- mmdet3d/datasets/custom_3d_seg.py | 930 +-- mmdet3d/datasets/dataset_wrappers.py | 152 +- mmdet3d/datasets/kitti2d_dataset.py | 482 +- mmdet3d/datasets/kitti_dataset.py | 1550 ++--- mmdet3d/datasets/kitti_mono_dataset.py | 1138 ++-- mmdet3d/datasets/lyft_dataset.py | 1134 ++-- mmdet3d/datasets/nuscenes_dataset.py | 1308 ++--- mmdet3d/datasets/nuscenes_mono_dataset.py | 1680 +++--- .../.ipynb_checkpoints/__init__-checkpoint.py | 68 +- .../transforms_3d-checkpoint.py | 3706 ++++++------ mmdet3d/datasets/pipelines/__init__.py | 68 +- mmdet3d/datasets/pipelines/compose.py | 120 +- .../datasets/pipelines/data_augment_utils.py | 822 +-- mmdet3d/datasets/pipelines/dbsampler.py | 680 +-- mmdet3d/datasets/pipelines/formating.py | 532 +- mmdet3d/datasets/pipelines/loading.py | 1370 ++--- mmdet3d/datasets/pipelines/test_time_aug.py | 458 +- mmdet3d/datasets/pipelines/transforms_3d.py | 5010 ++++++++--------- mmdet3d/datasets/s3dis_dataset.py | 890 +-- mmdet3d/datasets/scannet_dataset.py | 1228 ++-- mmdet3d/datasets/semantickitti_dataset.py | 220 +- mmdet3d/datasets/sunrgbd_dataset.py | 560 +- mmdet3d/datasets/utils.py | 280 +- mmdet3d/datasets/waymo_dataset.py | 1098 ++-- mmdet3d/models/__init__.py | 58 +- mmdet3d/models/backbones/__init__.py | 32 +- mmdet3d/models/backbones/base_pointnet.py | 78 +- mmdet3d/models/backbones/dgcnn.py | 196 +- mmdet3d/models/backbones/dla.py | 892 +-- mmdet3d/models/backbones/mink_resnet.py | 232 +- mmdet3d/models/backbones/multi_backbone.py | 254 +- mmdet3d/models/backbones/nostem_regnet.py | 168 +- mmdet3d/models/backbones/pointnet2_sa_msg.py | 350 +- mmdet3d/models/backbones/pointnet2_sa_ssg.py | 286 +- mmdet3d/models/backbones/second.py | 182 +- mmdet3d/models/builder.py | 274 +- mmdet3d/models/decode_heads/__init__.py | 12 +- mmdet3d/models/decode_heads/decode_head.py | 246 +- mmdet3d/models/decode_heads/dgcnn_head.py | 134 +- mmdet3d/models/decode_heads/paconv_head.py | 126 +- mmdet3d/models/decode_heads/pointnet2_head.py | 170 +- mmdet3d/models/dense_heads/__init__.py | 50 +- mmdet3d/models/dense_heads/anchor3d_head.py | 1032 ++-- .../dense_heads/anchor_free_mono3d_head.py | 1068 ++-- .../models/dense_heads/base_conv_bbox_head.py | 262 +- .../dense_heads/base_mono3d_dense_head.py | 156 +- .../models/dense_heads/centerpoint_head.py | 1660 +++--- .../models/dense_heads/fcos_mono3d_head.py | 1912 +++---- .../models/dense_heads/free_anchor3d_head.py | 570 +- .../models/dense_heads/groupfree3d_head.py | 1988 +++---- mmdet3d/models/dense_heads/monoflex_head.py | 1542 ++--- mmdet3d/models/dense_heads/parta2_rpn_head.py | 620 +- mmdet3d/models/dense_heads/pgd_head.py | 2458 ++++---- mmdet3d/models/dense_heads/point_rpn_head.py | 762 +-- .../models/dense_heads/shape_aware_head.py | 1030 ++-- .../models/dense_heads/smoke_mono3d_head.py | 1032 ++-- mmdet3d/models/dense_heads/ssd_3d_head.py | 1114 ++-- mmdet3d/models/dense_heads/train_mixins.py | 698 +-- mmdet3d/models/dense_heads/vote_head.py | 1326 ++--- mmdet3d/models/detectors/__init__.py | 54 +- mmdet3d/models/detectors/base.py | 254 +- mmdet3d/models/detectors/centerpoint.py | 392 +- mmdet3d/models/detectors/dynamic_voxelnet.py | 142 +- mmdet3d/models/detectors/fcos_mono3d.py | 44 +- mmdet3d/models/detectors/groupfree3dnet.py | 210 +- mmdet3d/models/detectors/h3dnet.py | 352 +- mmdet3d/models/detectors/imvotenet.py | 1638 
+++--- mmdet3d/models/detectors/imvoxelnet.py | 276 +- mmdet3d/models/detectors/mvx_faster_rcnn.py | 122 +- mmdet3d/models/detectors/mvx_two_stage.py | 1006 ++-- mmdet3d/models/detectors/parta2.py | 302 +- mmdet3d/models/detectors/point_rcnn.py | 296 +- mmdet3d/models/detectors/sassd.py | 272 +- mmdet3d/models/detectors/single_stage.py | 142 +- .../models/detectors/single_stage_mono3d.py | 500 +- mmdet3d/models/detectors/smoke_mono3d.py | 42 +- mmdet3d/models/detectors/ssd3dnet.py | 52 +- mmdet3d/models/detectors/two_stage.py | 102 +- mmdet3d/models/detectors/votenet.py | 214 +- mmdet3d/models/detectors/voxelnet.py | 260 +- mmdet3d/models/fusion_layers/__init__.py | 20 +- .../models/fusion_layers/coord_transform.py | 432 +- mmdet3d/models/fusion_layers/point_fusion.py | 612 +- mmdet3d/models/fusion_layers/vote_fusion.py | 400 +- mmdet3d/models/losses/__init__.py | 28 +- .../models/losses/axis_aligned_iou_loss.py | 158 +- mmdet3d/models/losses/chamfer_distance.py | 294 +- mmdet3d/models/losses/multibin_loss.py | 186 +- .../losses/paconv_regularization_loss.py | 216 +- .../models/losses/uncertain_smooth_l1_loss.py | 352 +- mmdet3d/models/middle_encoders/__init__.py | 16 +- .../models/middle_encoders/pillar_scatter.py | 204 +- .../models/middle_encoders/sparse_encoder.py | 982 ++-- mmdet3d/models/middle_encoders/sparse_unet.py | 600 +- mmdet3d/models/model_utils/__init__.py | 12 +- .../models/model_utils/edge_fusion_module.py | 156 +- mmdet3d/models/model_utils/transformer.py | 278 +- mmdet3d/models/model_utils/vote_module.py | 368 +- mmdet3d/models/necks/__init__.py | 20 +- mmdet3d/models/necks/dla_neck.py | 466 +- mmdet3d/models/necks/imvoxel_neck.py | 220 +- mmdet3d/models/necks/pointnet2_fp_neck.py | 178 +- mmdet3d/models/necks/second_fpn.py | 182 +- mmdet3d/models/roi_heads/__init__.py | 28 +- mmdet3d/models/roi_heads/base_3droi_head.py | 196 +- .../models/roi_heads/bbox_heads/__init__.py | 28 +- .../roi_heads/bbox_heads/h3d_bbox_head.py | 1850 +++--- .../roi_heads/bbox_heads/parta2_bbox_head.py | 1258 ++--- .../bbox_heads/point_rcnn_bbox_head.py | 1150 ++-- mmdet3d/models/roi_heads/h3d_roi_head.py | 318 +- .../models/roi_heads/mask_heads/__init__.py | 10 +- .../mask_heads/pointwise_semantic_head.py | 404 +- .../roi_heads/mask_heads/primitive_head.py | 1932 +++---- .../roi_heads/part_aggregation_roi_head.py | 650 +-- .../models/roi_heads/point_rcnn_roi_head.py | 572 +- .../roi_heads/roi_extractors/__init__.py | 18 +- .../single_roiaware_extractor.py | 108 +- .../single_roipoint_extractor.py | 128 +- mmdet3d/models/segmentors/__init__.py | 10 +- mmdet3d/models/segmentors/base.py | 272 +- mmdet3d/models/segmentors/encoder_decoder.py | 908 +-- mmdet3d/models/utils/__init__.py | 22 +- mmdet3d/models/utils/clip_sigmoid.py | 34 +- mmdet3d/models/utils/edge_indices.py | 176 +- mmdet3d/models/utils/gen_keypoints.py | 160 +- mmdet3d/models/utils/handle_objs.py | 270 +- mmdet3d/models/utils/mlp.py | 102 +- mmdet3d/models/voxel_encoders/__init__.py | 16 +- .../models/voxel_encoders/pillar_encoder.py | 646 +-- mmdet3d/models/voxel_encoders/utils.py | 364 +- .../models/voxel_encoders/voxel_encoder.py | 978 ++-- mmdet3d/ops/__init__.py | 96 +- mmdet3d/ops/dgcnn_modules/__init__.py | 12 +- mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py | 136 +- mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py | 118 +- mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py | 442 +- mmdet3d/ops/norm.py | 326 +- mmdet3d/ops/paconv/__init__.py | 8 +- mmdet3d/ops/paconv/paconv.py | 784 +-- mmdet3d/ops/paconv/utils.py | 174 +- 
mmdet3d/ops/pointnet_modules/__init__.py | 24 +- mmdet3d/ops/pointnet_modules/builder.py | 78 +- .../ops/pointnet_modules/paconv_sa_module.py | 684 +-- .../ops/pointnet_modules/point_fp_module.py | 158 +- .../ops/pointnet_modules/point_sa_module.py | 704 +-- mmdet3d/ops/sparse_block.py | 398 +- mmdet3d/ops/spconv/__init__.py | 28 +- .../ops/spconv/overwrite_spconv/__init__.py | 8 +- .../spconv/overwrite_spconv/write_spconv2.py | 236 +- mmdet3d/utils/__init__.py | 28 +- mmdet3d/utils/collect_env.py | 46 +- mmdet3d/utils/compat_cfg.py | 278 +- mmdet3d/utils/logger.py | 62 +- mmdet3d/utils/misc.py | 78 +- mmdet3d/utils/setup_env.py | 106 +- mmdet3d/version.py | 38 +- model/model_deployor/deployor.py | 39 +- model/model_deployor/deployor_utils.py | 235 +- model/model_deployor/onnx2tensorrt.py | 197 - test/test_engine/test_engine.py | 16 +- test/test_model_ops/test_deployor.py | 36 +- tools/deploy.py | 2 +- tools/engines.py | 4 +- 239 files changed, 53430 insertions(+), 53386 deletions(-) rename engine/{pointpillars_160e_kitti-3d-car.py => config.py} (100%) rename engine/{pointpillars_160e_kitti-3d-car_bk.py => config_bk.py} (100%) rename engine/{pointpillars_engine.py => engine_utils.py} (97%) create mode 100644 mmdet3d/.ipynb_checkpoints/__init__-checkpoint.py delete mode 100644 model/model_deployor/onnx2tensorrt.py diff --git a/data/data_slimmor/__init__.py b/data/data_slimmor/__init__.py index 80abd14f9651dfd44235bf317d3adcdcc9f595cd..b03dfc466802d828c7baa4ff319d05237b93e18c 100644 GIT binary patch literal 6 NcmezWkBfne0RRg!0ulfK literal 526 zcmZWmJx{_=6g>km+CbEmscvXc0)F7)qJ-$cBM}rk7$Z;=L!q&>sKhYX&BTO79sLKo zvN`GG>OXKZx;U7aSkG%+jKgd0JLmM=bMN!MlbC^u@$#z(X&2&NtcqN|*;5ejCXq&@ z81)1)C{VM+vw%hF=MnIdO(R7lP2|&g+g35hTMAQRU@AVN;=S-99{bM)*V*jqwt|U+ z^U}lRSUKuNM7Zdng(K)_(A`2EC#+qk#BUvY*hiaHmE7{L&SIT~?Q2_TB8C#VEGCId z*PzEo?NoCefgF=Zy$ro9I_*+lq2Dwd)`vtT(&nFQbl)J}`PVn_P!Ko{%)T)7X2rb@ zX+v!VU&hUP1}hL4_HgyozteMlh>~_|oo$7^UOr}AKg(g%lA;}C$m?{82+vGM;j+*!$iVYbf`>l!#ZkI SCH%>Q-Ks - -We built Pytorch Lighting Module for pointpillars, You can refer to [PyTorch Lightning](https://www.pytorchlightning.ai/) to build your model. - -```python -class Pointpillars_engine(pl.LightningModule): - def __init__(self): - # define pytorch networks. - - def init_weights(self): - # define pytorch network init. - - def forward(self, img_metas, points): - # define the full inference process for an image. Note that this function is not used in training and validation, but only when it is called explicitly. 
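        # Sketch of what a PointPillars-style forward pass typically contains
        # (illustrative only, not necessarily the exact body used here):
        #   voxels, num_points, coors = self.voxelize(points)
        #   feats = self.voxel_encoder(voxels, num_points, coors)
        #   x = self.middle_encoder(feats, coors, len(img_metas))
        #   x = self.neck(self.backbone(x))
        #   outs = self.bbox_head(x)
        #   return self.bbox_head.get_bboxes(*outs, img_metas)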
- - def configure_optimizers(self): - # define optimizers and learning rate control strategies - - def training_step(self, train_batch, batch_idx): - # Define the forward process of a batch during training and how to calculate the loss - - def validation_step(self, val_batch, batch_idx): - # Defining the forward and post-processing of a batch when evaluate - - def validation_epoch_end(self, outputs): - # how to calculate metrics for all results - - def predict_step(self, batch, batch_idx, dataloader_idx=0): - # define the forward and post-processing of a batch when reasoning -``` - - - -After defining the Pytorch model, we wrap it into a Pytorch Lighting model as: - -```python -torch_model = Pointpillars() -model = Pointpillars_engine(torch_model) -``` - - - -## Usage - -You should first install Pytorch Lighting as: - -```shell -pip install pytorch-lightning -``` - -Then prepare Kitti dataset first as https://github.com/open-mmlab/mmdetection3d/blob/master/docs/zh_cn/datasets/kitti_det.md - - - -We offer five ways to use it: - -**fit**: train from scratch or fine-tune your pytorch model - -**eval**: evaluate your pytorch model and calculate metrics - -**predict**: inference a batch of point cloud data - -**inference**: inference a point cloud data - -**infer_production**: unwarp Pytorch Lighting model to get the Pytorch model, and then infer a point cloud data - - - -You can execute scripts as: - -```shell -python tools/engines.py ---model_name -pointpillars ---mode -fit ---config -engine/pointpillars_160e_kitti-3d-car.py -``` - - - -You can download our pointpillar network trained using pytorch lighing from [best-epoch=159-ap=76.5387.ckpt](https://drive.google.com/file/d/1GhOuRm_9DOR7FWSKxO-5NMPYO2SfkJNh/view?usp=sharing) - - - -## Extension - -**For Deployor or Compressor:** - -You can wrap the Pytorch model into a Pytorch Lighting model for training. After training, you can unwarp it in the following way to get the Pytorch model for deployment or compression. - -```python -checkpoint = torch.load("lightning_logs/version_9/checkpoints/epoch=3-step=14848.ckpt") -state_dict = checkpoint["state_dict"] - -# update keys by dropping `torch_model.` -for key in list(state_dict): - state_dict[key.replace("torch_model.", "")] = state_dict.pop(key) - -torch_model.load_state_dict(state_dict) -``` - - - -**For Data ops:** - -You can add data ops to the data pipeline in the configuration file for training or evaluation. - -**Note**: You need the function that converts data ops to [mmdet3d](https://github.com/open-mmlab/mmdetection3d) format. - -We have converted some data ops into the above format and added them to the configuration file, you can refer to them. 
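As a concrete illustration, the statistical-outlier filter in the skeleton below could be implemented with Open3D. This is a minimal sketch under the assumption that Open3D is available; it mirrors the parameters used by `DataFilter` but is not the project's actual implementation.

```python
# Minimal sketch (assumes `pip install open3d`); illustrative only.
import numpy as np
import open3d as o3d


def remove_statistical_outlier(lidar_data, nb_neighbors=20, std_ratio=0.05):
    """Drop points whose mean distance to their neighbors is anomalous."""
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(lidar_data[:, :3])
    _, kept_idx = pcd.remove_statistical_outlier(
        nb_neighbors=nb_neighbors, std_ratio=std_ratio)
    # preserve the extra channels (e.g. intensity) of the surviving points
    return lidar_data[np.asarray(kept_idx)]
```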
- -```python -@PIPELINES.register_module() -class DataFilter(object): - - def __init__(self, method='remove_statistical_outlier', params={"nb_neighbors" : 20, "std_ratio" : 0.05}): - self.method = method - self.params = params - - def remove_statistical_outlier(self, lidar_data, nb_neighbors, std_ratio): - - return filtered_lidar_data - - def remove_radius_outlier(self, lidar_data, nb_points, radius): - - return filtered_lidar_data - - def voxel_down_sample(self, lidar_data, voxel_size): - - return filtered_lidar_data - - - def __call__(self, results): - - return results -``` - - - -```python -train=dict( - type='KittiDataset', - data_root='kitti/', - ann_file='kitti/kitti_infos_train.pkl', - split='training', - pts_prefix='velodyne_reduced', - pipeline=[ - dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), - dict( - type='DataFilter', - method='remove_statistical_outlier', - params={"nb_neighbors": 20, "std_ratio": 0.05}), -``` +# Engine + +Tiny3D uses a warp engine for the pytorch model to train and evaluate, and the trained parameters can be extracted for deployment and compression. The relationship between the engine and the other modules is shown in the following diagram. + +image-20220603163849335 + +We built Pytorch Lighting Module for pointpillars, You can refer to [PyTorch Lightning](https://www.pytorchlightning.ai/) to build your model. + +```python +class Engine(pl.LightningModule): + def __init__(self): + # define pytorch networks. + + def init_weights(self): + # define pytorch network init. + + def forward(self, img_metas, points): + # define the full inference process for an image. Note that this function is not used in training and validation, but only when it is called explicitly. + + def configure_optimizers(self): + # define optimizers and learning rate control strategies + + def training_step(self, train_batch, batch_idx): + # Define the forward process of a batch during training and how to calculate the loss + + def validation_step(self, val_batch, batch_idx): + # Defining the forward and post-processing of a batch when evaluate + + def validation_epoch_end(self, outputs): + # how to calculate metrics for all results + + def predict_step(self, batch, batch_idx, dataloader_idx=0): + # define the forward and post-processing of a batch when reasoning +``` + + + +After defining the Pytorch model, we wrap it into a Pytorch Lighting model as: + +```python +torch_model = Pointpillars() +model = Engine(torch_model) +``` + + + +## Usage + +You should first install Pytorch Lighting as: + +```shell +pip install pytorch-lightning +``` + +Then prepare Kitti dataset first as https://github.com/open-mmlab/mmdetection3d/blob/master/docs/zh_cn/datasets/kitti_det.md + + + +We offer five ways to use it: + +**fit**: train from scratch or fine-tune your pytorch model + +**eval**: evaluate your pytorch model and calculate metrics + +**predict**: inference a batch of point cloud data + +**inference**: inference a point cloud data + +**infer_production**: unwarp Pytorch Lighting model to get the Pytorch model, and then infer a point cloud data + + + +You can execute scripts as: + +```shell +python tools/engines.py +--model_name +pointpillars +--mode +fit +--config +engine/config.py +``` + + + +You can download our pointpillar network trained using pytorch lighing from [best-epoch=159-ap=76.5387.ckpt](https://drive.google.com/file/d/1GhOuRm_9DOR7FWSKxO-5NMPYO2SfkJNh/view?usp=sharing) + + + +## Extension + +**For Deployor or Compressor:** + +You can wrap the Pytorch model into a Pytorch 
Lighting model for training. After training, you can unwarp it in the following way to get the Pytorch model for deployment or compression. + +```python +checkpoint = torch.load("lightning_logs/version_9/checkpoints/epoch=3-step=14848.ckpt") +state_dict = checkpoint["state_dict"] + +# update keys by dropping `torch_model.` +for key in list(state_dict): + state_dict[key.replace("torch_model.", "")] = state_dict.pop(key) + +torch_model.load_state_dict(state_dict) +``` + + + +**For Data ops:** + +You can add data ops to the data pipeline in the configuration file for training or evaluation. + +**Note**: You need the function that converts data ops to [mmdet3d](https://github.com/open-mmlab/mmdetection3d) format. + +We have converted some data ops into the above format and added them to the configuration file, you can refer to them. + +```python +@PIPELINES.register_module() +class DataFilter(object): + + def __init__(self, method='remove_statistical_outlier', params={"nb_neighbors" : 20, "std_ratio" : 0.05}): + self.method = method + self.params = params + + def remove_statistical_outlier(self, lidar_data, nb_neighbors, std_ratio): + + return filtered_lidar_data + + def remove_radius_outlier(self, lidar_data, nb_points, radius): + + return filtered_lidar_data + + def voxel_down_sample(self, lidar_data, voxel_size): + + return filtered_lidar_data + + + def __call__(self, results): + + return results +``` + + + +```python +train=dict( + type='KittiDataset', + data_root='kitti/', + ann_file='kitti/kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=[ + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='DataFilter', + method='remove_statistical_outlier', + params={"nb_neighbors": 20, "std_ratio": 0.05}), +``` diff --git a/engine/pointpillars_160e_kitti-3d-car.py b/engine/config.py similarity index 100% rename from engine/pointpillars_160e_kitti-3d-car.py rename to engine/config.py diff --git a/engine/pointpillars_160e_kitti-3d-car_bk.py b/engine/config_bk.py similarity index 100% rename from engine/pointpillars_160e_kitti-3d-car_bk.py rename to engine/config_bk.py diff --git a/engine/pointpillars_engine.py b/engine/engine_utils.py similarity index 97% rename from engine/pointpillars_engine.py rename to engine/engine_utils.py index 95fae3b..e6b7297 100644 --- a/engine/pointpillars_engine.py +++ b/engine/engine_utils.py @@ -17,7 +17,7 @@ from mmdet.models.losses import FocalLoss, CrossEntropyLoss, SmoothL1Loss -class Pointpillars_engine(pl.LightningModule): +class Engine(pl.LightningModule): def __init__(self, torch_model): super().__init__() self.torch_model = torch_model diff --git a/engine/engineor.py b/engine/engineor.py index 94d1c6d..adaaccb 100644 --- a/engine/engineor.py +++ b/engine/engineor.py @@ -1,7 +1,7 @@ import torch import pytorch_lightning as pl from torch.utils.data import DataLoader -from engine.pointpillars_engine import Pointpillars_engine +from engine.engine_utils import Engine import pytorch_lightning.callbacks as plc from functools import partial from mmcv.parallel import collate @@ -25,7 +25,7 @@ def fit(dataset_train, dataset_val, torch_model, epoch=80, devices=1, accelerato https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/ """ - model = Pointpillars_engine(torch_model) + model = Engine(torch_model) data_loader_train = DataLoader( dataset_train, @@ -70,7 +70,7 @@ def eval(dataset_val, torch_model, weights=None, accelerator='gpu', devices=1): 
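# Follow-up to the README's "Extension" recipe above (a sketch, not part of
# engineor.py): after stripping the `torch_model.` prefix from a Lightning
# checkpoint, the plain PyTorch weights can be saved for the deployor or
# compressor stages. `Pointpillars()` is the constructor shown in the README;
# its import path is not spelled out there, and the checkpoint path below is
# a placeholder.
#
#     torch_model = Pointpillars()
#     checkpoint = torch.load('path/to/last.ckpt', map_location='cpu')
#     state_dict = {k.replace('torch_model.', ''): v
#                   for k, v in checkpoint['state_dict'].items()}
#     torch_model.load_state_dict(state_dict)
#     torch_model.eval()
#     torch.save(torch_model.state_dict(), 'pointpillars_plain.pth')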
https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/ """ - model = Pointpillars_engine(torch_model) + model = Engine(torch_model) # load weights if weights != None: checkpoint = torch.load(weights) @@ -101,7 +101,7 @@ def predict(dataset_val, torch_model, weights=None, accelerator='gpu', devices=1 https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/ """ - model = Pointpillars_engine(torch_model) + model = Engine(torch_model) # load weights if weights != None: checkpoint = torch.load(weights) @@ -133,7 +133,7 @@ def inference(pcd_data, torch_model, weights=None, accelerator='gpu', devices=1) https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/ """ - model = Pointpillars_engine(torch_model) + model = Engine(torch_model) # load weights if weights != None: checkpoint = torch.load(weights) diff --git a/mmdet3d/.ipynb_checkpoints/__init__-checkpoint.py b/mmdet3d/.ipynb_checkpoints/__init__-checkpoint.py new file mode 100644 index 0000000..312e9b4 --- /dev/null +++ b/mmdet3d/.ipynb_checkpoints/__init__-checkpoint.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv + +import mmdet +import mmseg +from .version import __version__, short_version + + +def digit_version(version_str): + digit_version = [] + for x in version_str.split('.'): + if x.isdigit(): + digit_version.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + digit_version.append(int(patch_version[0]) - 1) + digit_version.append(int(patch_version[1])) + return digit_version + + +mmcv_minimum_version = '1.4.8' +mmcv_maximum_version = '1.6.0' +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version <= digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' + +mmdet_minimum_version = '2.24.0' +mmdet_maximum_version = '3.0.0' +mmdet_version = digit_version(mmdet.__version__) +assert (mmdet_version >= digit_version(mmdet_minimum_version) + and mmdet_version <= digit_version(mmdet_maximum_version)), \ + f'MMDET=={mmdet.__version__} is used but incompatible. ' \ + f'Please install mmdet>={mmdet_minimum_version}, ' \ + f'<={mmdet_maximum_version}.' + +mmseg_minimum_version = '0.20.0' +mmseg_maximum_version = '1.0.0' +mmseg_version = digit_version(mmseg.__version__) +assert (mmseg_version >= digit_version(mmseg_minimum_version) + and mmseg_version <= digit_version(mmseg_maximum_version)), \ + f'MMSEG=={mmseg.__version__} is used but incompatible. ' \ + f'Please install mmseg>={mmseg_minimum_version}, ' \ + f'<={mmseg_maximum_version}.' + +__all__ = ['__version__', 'short_version'] diff --git a/mmdet3d/__init__.py b/mmdet3d/__init__.py index 312e9b4..1efa706 100644 --- a/mmdet3d/__init__.py +++ b/mmdet3d/__init__.py @@ -1,49 +1,49 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
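# Programmatic use of the helpers defined in engine/engineor.py above
# (a sketch; dataset construction is assumed to follow the KITTI preparation
# described in the README and is not shown, and checkpoint paths are
# placeholders):
#
#     from engine.engineor import fit, eval, inference
#
#     torch_model = Pointpillars()          # constructor shown in the README
#     fit(dataset_train, dataset_val, torch_model,
#         epoch=160, devices=1, accelerator='gpu')
#     eval(dataset_val, torch_model,
#          weights='path/to/best.ckpt', accelerator='gpu', devices=1)
#     inference(pcd_data, torch_model,
#               weights='path/to/best.ckpt', accelerator='gpu', devices=1)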
-import mmcv - -import mmdet -import mmseg -from .version import __version__, short_version - - -def digit_version(version_str): - digit_version = [] - for x in version_str.split('.'): - if x.isdigit(): - digit_version.append(int(x)) - elif x.find('rc') != -1: - patch_version = x.split('rc') - digit_version.append(int(patch_version[0]) - 1) - digit_version.append(int(patch_version[1])) - return digit_version - - -mmcv_minimum_version = '1.4.8' -mmcv_maximum_version = '1.6.0' -mmcv_version = digit_version(mmcv.__version__) - - -assert (mmcv_version >= digit_version(mmcv_minimum_version) - and mmcv_version <= digit_version(mmcv_maximum_version)), \ - f'MMCV=={mmcv.__version__} is used but incompatible. ' \ - f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' - -mmdet_minimum_version = '2.24.0' -mmdet_maximum_version = '3.0.0' -mmdet_version = digit_version(mmdet.__version__) -assert (mmdet_version >= digit_version(mmdet_minimum_version) - and mmdet_version <= digit_version(mmdet_maximum_version)), \ - f'MMDET=={mmdet.__version__} is used but incompatible. ' \ - f'Please install mmdet>={mmdet_minimum_version}, ' \ - f'<={mmdet_maximum_version}.' - -mmseg_minimum_version = '0.20.0' -mmseg_maximum_version = '1.0.0' -mmseg_version = digit_version(mmseg.__version__) -assert (mmseg_version >= digit_version(mmseg_minimum_version) - and mmseg_version <= digit_version(mmseg_maximum_version)), \ - f'MMSEG=={mmseg.__version__} is used but incompatible. ' \ - f'Please install mmseg>={mmseg_minimum_version}, ' \ - f'<={mmseg_maximum_version}.' - -__all__ = ['__version__', 'short_version'] +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv + +import mmdet +import mmseg +from .version import __version__, short_version + + +def digit_version(version_str): + digit_version = [] + for x in version_str.split('.'): + if x.isdigit(): + digit_version.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + digit_version.append(int(patch_version[0]) - 1) + digit_version.append(int(patch_version[1])) + return digit_version + + +mmcv_minimum_version = '1.4.8' +mmcv_maximum_version = '1.6.0' +mmcv_version = digit_version(mmcv.__version__) + + +assert (mmcv_version >= digit_version(mmcv_minimum_version) + and mmcv_version <= digit_version(mmcv_maximum_version)), \ + f'MMCV=={mmcv.__version__} is used but incompatible. ' \ + f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' + +mmdet_minimum_version = '2.24.0' +mmdet_maximum_version = '3.0.0' +mmdet_version = digit_version(mmdet.__version__) +assert (mmdet_version >= digit_version(mmdet_minimum_version) + and mmdet_version <= digit_version(mmdet_maximum_version)), \ + f'MMDET=={mmdet.__version__} is used but incompatible. ' \ + f'Please install mmdet>={mmdet_minimum_version}, ' \ + f'<={mmdet_maximum_version}.' + +mmseg_minimum_version = '0.20.0' +mmseg_maximum_version = '1.0.0' +mmseg_version = digit_version(mmseg.__version__) +assert (mmseg_version >= digit_version(mmseg_minimum_version) + and mmseg_version <= digit_version(mmseg_maximum_version)), \ + f'MMSEG=={mmseg.__version__} is used but incompatible. ' \ + f'Please install mmseg>={mmseg_minimum_version}, ' \ + f'<={mmseg_maximum_version}.' + +__all__ = ['__version__', 'short_version'] diff --git a/mmdet3d/apis/__init__.py b/mmdet3d/apis/__init__.py index 5befc10..b768578 100644 --- a/mmdet3d/apis/__init__.py +++ b/mmdet3d/apis/__init__.py @@ -1,14 +1,14 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
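# How the digit_version() helper above orders release candidates (worked
# example, values computed from the function as written):
#
#     digit_version('1.4.8')    -> [1, 4, 8]
#     digit_version('1.6.0rc2') -> [1, 6, -1, 2]  # 'rc' sorts below the release
#     digit_version('1.6.0')    -> [1, 6, 0]
#
# so the check `mmcv_version <= digit_version('1.6.0')` accepts 1.6.0rc2 but
# rejects anything newer than 1.6.0.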
-from .inference import (convert_SyncBN, inference_detector, - inference_mono_3d_detector, - inference_multi_modality_detector, inference_segmentor, - init_model, show_result_meshlab) -from .test import single_gpu_test -from .train import init_random_seed, train_model - -__all__ = [ - 'inference_detector', 'init_model', 'single_gpu_test', - 'inference_mono_3d_detector', 'show_result_meshlab', 'convert_SyncBN', - 'train_model', 'inference_multi_modality_detector', 'inference_segmentor', - 'init_random_seed' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .inference import (convert_SyncBN, inference_detector, + inference_mono_3d_detector, + inference_multi_modality_detector, inference_segmentor, + init_model, show_result_meshlab) +from .test import single_gpu_test +from .train import init_random_seed, train_model + +__all__ = [ + 'inference_detector', 'init_model', 'single_gpu_test', + 'inference_mono_3d_detector', 'show_result_meshlab', 'convert_SyncBN', + 'train_model', 'inference_multi_modality_detector', 'inference_segmentor', + 'init_random_seed' +] diff --git a/mmdet3d/apis/inference.py b/mmdet3d/apis/inference.py index 1457182..cc17e69 100644 --- a/mmdet3d/apis/inference.py +++ b/mmdet3d/apis/inference.py @@ -1,526 +1,526 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import re -from copy import deepcopy -from os import path as osp - -import mmcv -import numpy as np -import torch -from mmcv.parallel import collate, scatter -from mmcv.runner import load_checkpoint - -from mmdet3d.core import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, - DepthInstance3DBoxes, LiDARInstance3DBoxes, - show_multi_modality_result, show_result, - show_seg_result) -from mmdet3d.core.bbox import get_box_type -from mmdet3d.datasets.pipelines import Compose -from mmdet3d.models import build_model -from mmdet3d.utils import get_root_logger - - -def convert_SyncBN(config): - """Convert config's naiveSyncBN to BN. - - Args: - config (str or :obj:`mmcv.Config`): Config file path or the config - object. - """ - if isinstance(config, dict): - for item in config: - if item == 'norm_cfg': - config[item]['type'] = config[item]['type']. \ - replace('naiveSyncBN', 'BN') - else: - convert_SyncBN(config[item]) - - -def init_model(config, checkpoint=None, device='cuda:0'): - """Initialize a model from config file, which could be a 3D detector or a - 3D segmentor. - - Args: - config (str or :obj:`mmcv.Config`): Config file path or the config - object. - checkpoint (str, optional): Checkpoint path. If left as None, the model - will not load any weights. - device (str): Device to use. - - Returns: - nn.Module: The constructed detector. 
- """ - if isinstance(config, str): - config = mmcv.Config.fromfile(config) - elif not isinstance(config, mmcv.Config): - raise TypeError('config must be a filename or Config object, ' - f'but got {type(config)}') - config.model.pretrained = None - convert_SyncBN(config.model) - config.model.train_cfg = None - model = build_model(config.model, test_cfg=config.get('test_cfg')) - if checkpoint is not None: - checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') - if 'CLASSES' in checkpoint['meta']: - model.CLASSES = checkpoint['meta']['CLASSES'] - else: - model.CLASSES = config.class_names - if 'PALETTE' in checkpoint['meta']: # 3D Segmentor - model.PALETTE = checkpoint['meta']['PALETTE'] - model.cfg = config # save the config in the model for convenience - if device != 'cpu': - torch.cuda.set_device(device) - else: - logger = get_root_logger() - logger.warning('Don\'t suggest using CPU device. ' - 'Some functions are not supported for now.') - model.to(device) - model.eval() - return model - - -def inference_detector(model, pcd): - """Inference point cloud with the detector. - - Args: - model (nn.Module): The loaded detector. - pcd (str): Point cloud files. - - Returns: - tuple: Predicted results and data from pipeline. - """ - cfg = model.cfg - device = next(model.parameters()).device # model device - - if not isinstance(pcd, str): - cfg = cfg.copy() - # set loading pipeline type - cfg.data.test.pipeline[0].type = 'LoadPointsFromDict' - - # build the data pipeline - test_pipeline = deepcopy(cfg.data.test.pipeline) - test_pipeline = Compose(test_pipeline) - box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) - - if isinstance(pcd, str): - # load from point clouds file - data = dict( - pts_filename=pcd, - box_type_3d=box_type_3d, - box_mode_3d=box_mode_3d, - # for ScanNet demo we need axis_align_matrix - ann_info=dict(axis_align_matrix=np.eye(4)), - sweeps=[], - # set timestamp = 0 - timestamp=[0], - img_fields=[], - bbox3d_fields=[], - pts_mask_fields=[], - pts_seg_fields=[], - bbox_fields=[], - mask_fields=[], - seg_fields=[]) - else: - # load from http - data = dict( - points=pcd, - box_type_3d=box_type_3d, - box_mode_3d=box_mode_3d, - # for ScanNet demo we need axis_align_matrix - ann_info=dict(axis_align_matrix=np.eye(4)), - sweeps=[], - # set timestamp = 0 - timestamp=[0], - img_fields=[], - bbox3d_fields=[], - pts_mask_fields=[], - pts_seg_fields=[], - bbox_fields=[], - mask_fields=[], - seg_fields=[]) - data = test_pipeline(data) - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device.index])[0] - else: - # this is a workaround to avoid the bug of MMDataParallel - data['img_metas'] = data['img_metas'][0].data - data['points'] = data['points'][0].data - # forward the model - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - return result, data - - -def inference_multi_modality_detector(model, pcd, image, ann_file): - """Inference point cloud with the multi-modality detector. - - Args: - model (nn.Module): The loaded detector. - pcd (str): Point cloud files. - image (str): Image files. - ann_file (str): Annotation files. - - Returns: - tuple: Predicted results and data from pipeline. 
- """ - cfg = model.cfg - device = next(model.parameters()).device # model device - # build the data pipeline - test_pipeline = deepcopy(cfg.data.test.pipeline) - test_pipeline = Compose(test_pipeline) - box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) - # get data info containing calib - data_infos = mmcv.load(ann_file) - image_idx = int(re.findall(r'\d+', image)[-1]) # xxx/sunrgbd_000017.jpg - for x in data_infos: - if int(x['image']['image_idx']) != image_idx: - continue - info = x - break - data = dict( - pts_filename=pcd, - img_prefix=osp.dirname(image), - img_info=dict(filename=osp.basename(image)), - box_type_3d=box_type_3d, - box_mode_3d=box_mode_3d, - img_fields=[], - bbox3d_fields=[], - pts_mask_fields=[], - pts_seg_fields=[], - bbox_fields=[], - mask_fields=[], - seg_fields=[]) - data = test_pipeline(data) - - # TODO: this code is dataset-specific. Move lidar2img and - # depth2img to .pkl annotations in the future. - # LiDAR to image conversion - if box_mode_3d == Box3DMode.LIDAR: - rect = info['calib']['R0_rect'].astype(np.float32) - Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) - P2 = info['calib']['P2'].astype(np.float32) - lidar2img = P2 @ rect @ Trv2c - data['img_metas'][0].data['lidar2img'] = lidar2img - # Depth to image conversion - elif box_mode_3d == Box3DMode.DEPTH: - rt_mat = info['calib']['Rt'] - # follow Coord3DMode.convert_point - rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0] - ]) @ rt_mat.transpose(1, 0) - depth2img = info['calib']['K'] @ rt_mat - data['img_metas'][0].data['depth2img'] = depth2img - - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device.index])[0] - else: - # this is a workaround to avoid the bug of MMDataParallel - data['img_metas'] = data['img_metas'][0].data - data['points'] = data['points'][0].data - data['img'] = data['img'][0].data - - # forward the model - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - return result, data - - -def inference_mono_3d_detector(model, image, ann_file): - """Inference image with the monocular 3D detector. - - Args: - model (nn.Module): The loaded detector. - image (str): Image files. - ann_file (str): Annotation files. - - Returns: - tuple: Predicted results and data from pipeline. 
- """ - cfg = model.cfg - device = next(model.parameters()).device # model device - # build the data pipeline - test_pipeline = deepcopy(cfg.data.test.pipeline) - test_pipeline = Compose(test_pipeline) - box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) - # get data info containing calib - data_infos = mmcv.load(ann_file) - # find the info corresponding to this image - for x in data_infos['images']: - if osp.basename(x['file_name']) != osp.basename(image): - continue - img_info = x - break - data = dict( - img_prefix=osp.dirname(image), - img_info=dict(filename=osp.basename(image)), - box_type_3d=box_type_3d, - box_mode_3d=box_mode_3d, - img_fields=[], - bbox3d_fields=[], - pts_mask_fields=[], - pts_seg_fields=[], - bbox_fields=[], - mask_fields=[], - seg_fields=[]) - - # camera points to image conversion - if box_mode_3d == Box3DMode.CAM: - data['img_info'].update(dict(cam_intrinsic=img_info['cam_intrinsic'])) - - data = test_pipeline(data) - - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device.index])[0] - else: - # this is a workaround to avoid the bug of MMDataParallel - data['img_metas'] = data['img_metas'][0].data - data['img'] = data['img'][0].data - - # forward the model - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - return result, data - - -def inference_segmentor(model, pcd): - """Inference point cloud with the segmentor. - - Args: - model (nn.Module): The loaded segmentor. - pcd (str): Point cloud files. - - Returns: - tuple: Predicted results and data from pipeline. - """ - cfg = model.cfg - device = next(model.parameters()).device # model device - # build the data pipeline - test_pipeline = deepcopy(cfg.data.test.pipeline) - test_pipeline = Compose(test_pipeline) - data = dict( - pts_filename=pcd, - img_fields=[], - bbox3d_fields=[], - pts_mask_fields=[], - pts_seg_fields=[], - bbox_fields=[], - mask_fields=[], - seg_fields=[]) - data = test_pipeline(data) - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device.index])[0] - else: - # this is a workaround to avoid the bug of MMDataParallel - data['img_metas'] = data['img_metas'][0].data - data['points'] = data['points'][0].data - # forward the model - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - return result, data - - -def show_det_result_meshlab(data, - result, - out_dir, - score_thr=0.0, - show=False, - snapshot=False): - """Show 3D detection result by meshlab.""" - points = data['points'][0][0].cpu().numpy() - pts_filename = data['img_metas'][0][0]['pts_filename'] - file_name = osp.split(pts_filename)[-1].split('.')[0] - - if 'pts_bbox' in result[0].keys(): - pred_bboxes = result[0]['pts_bbox']['boxes_3d'].tensor.numpy() - pred_scores = result[0]['pts_bbox']['scores_3d'].numpy() - else: - pred_bboxes = result[0]['boxes_3d'].tensor.numpy() - pred_scores = result[0]['scores_3d'].numpy() - - # filter out low score bboxes for visualization - if score_thr > 0: - inds = pred_scores > score_thr - pred_bboxes = pred_bboxes[inds] - - # for now we convert points into depth mode - box_mode = data['img_metas'][0][0]['box_mode_3d'] - if box_mode != Box3DMode.DEPTH: - points = Coord3DMode.convert(points, box_mode, Coord3DMode.DEPTH) - show_bboxes = Box3DMode.convert(pred_bboxes, box_mode, Box3DMode.DEPTH) - else: - show_bboxes = deepcopy(pred_bboxes) - - show_result( - 
points, - None, - show_bboxes, - out_dir, - file_name, - show=show, - snapshot=snapshot) - - return file_name - - -def show_seg_result_meshlab(data, - result, - out_dir, - palette, - show=False, - snapshot=False): - """Show 3D segmentation result by meshlab.""" - points = data['points'][0][0].cpu().numpy() - pts_filename = data['img_metas'][0][0]['pts_filename'] - file_name = osp.split(pts_filename)[-1].split('.')[0] - - pred_seg = result[0]['semantic_mask'].numpy() - - if palette is None: - # generate random color map - max_idx = pred_seg.max() - palette = np.random.randint(0, 256, size=(max_idx + 1, 3)) - palette = np.array(palette).astype(np.int) - - show_seg_result( - points, - None, - pred_seg, - out_dir, - file_name, - palette=palette, - show=show, - snapshot=snapshot) - - return file_name - - -def show_proj_det_result_meshlab(data, - result, - out_dir, - score_thr=0.0, - show=False, - snapshot=False): - """Show result of projecting 3D bbox to 2D image by meshlab.""" - assert 'img' in data.keys(), 'image data is not provided for visualization' - - img_filename = data['img_metas'][0][0]['filename'] - file_name = osp.split(img_filename)[-1].split('.')[0] - - # read from file because img in data_dict has undergone pipeline transform - img = mmcv.imread(img_filename) - - if 'pts_bbox' in result[0].keys(): - result[0] = result[0]['pts_bbox'] - elif 'img_bbox' in result[0].keys(): - result[0] = result[0]['img_bbox'] - pred_bboxes = result[0]['boxes_3d'].tensor.numpy() - pred_scores = result[0]['scores_3d'].numpy() - - # filter out low score bboxes for visualization - if score_thr > 0: - inds = pred_scores > score_thr - pred_bboxes = pred_bboxes[inds] - - box_mode = data['img_metas'][0][0]['box_mode_3d'] - if box_mode == Box3DMode.LIDAR: - if 'lidar2img' not in data['img_metas'][0][0]: - raise NotImplementedError( - 'LiDAR to image transformation matrix is not provided') - - show_bboxes = LiDARInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0)) - - show_multi_modality_result( - img, - None, - show_bboxes, - data['img_metas'][0][0]['lidar2img'], - out_dir, - file_name, - box_mode='lidar', - show=show) - elif box_mode == Box3DMode.DEPTH: - show_bboxes = DepthInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0)) - - show_multi_modality_result( - img, - None, - show_bboxes, - None, - out_dir, - file_name, - box_mode='depth', - img_metas=data['img_metas'][0][0], - show=show) - elif box_mode == Box3DMode.CAM: - if 'cam2img' not in data['img_metas'][0][0]: - raise NotImplementedError( - 'camera intrinsic matrix is not provided') - - show_bboxes = CameraInstance3DBoxes( - pred_bboxes, box_dim=pred_bboxes.shape[-1], origin=(0.5, 1.0, 0.5)) - - show_multi_modality_result( - img, - None, - show_bboxes, - data['img_metas'][0][0]['cam2img'], - out_dir, - file_name, - box_mode='camera', - show=show) - else: - raise NotImplementedError( - f'visualization of {box_mode} bbox is not supported') - - return file_name - - -def show_result_meshlab(data, - result, - out_dir, - score_thr=0.0, - show=False, - snapshot=False, - task='det', - palette=None): - """Show result by meshlab. - - Args: - data (dict): Contain data from pipeline. - result (dict): Predicted result from model. - out_dir (str): Directory to save visualized result. - score_thr (float, optional): Minimum score of bboxes to be shown. - Default: 0.0 - show (bool, optional): Visualize the results online. Defaults to False. - snapshot (bool, optional): Whether to save the online results. - Defaults to False. 
- task (str, optional): Distinguish which task result to visualize. - Currently we support 3D detection, multi-modality detection and - 3D segmentation. Defaults to 'det'. - palette (list[list[int]]] | np.ndarray, optional): The palette - of segmentation map. If None is given, random palette will be - generated. Defaults to None. - """ - assert task in ['det', 'multi_modality-det', 'seg', 'mono-det'], \ - f'unsupported visualization task {task}' - assert out_dir is not None, 'Expect out_dir, got none.' - - if task in ['det', 'multi_modality-det']: - file_name = show_det_result_meshlab(data, result, out_dir, score_thr, - show, snapshot) - - if task in ['seg']: - file_name = show_seg_result_meshlab(data, result, out_dir, palette, - show, snapshot) - - if task in ['multi_modality-det', 'mono-det']: - file_name = show_proj_det_result_meshlab(data, result, out_dir, - score_thr, show, snapshot) - - return out_dir, file_name +# Copyright (c) OpenMMLab. All rights reserved. +import re +from copy import deepcopy +from os import path as osp + +import mmcv +import numpy as np +import torch +from mmcv.parallel import collate, scatter +from mmcv.runner import load_checkpoint + +from mmdet3d.core import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, + DepthInstance3DBoxes, LiDARInstance3DBoxes, + show_multi_modality_result, show_result, + show_seg_result) +from mmdet3d.core.bbox import get_box_type +from mmdet3d.datasets.pipelines import Compose +from mmdet3d.models import build_model +from mmdet3d.utils import get_root_logger + + +def convert_SyncBN(config): + """Convert config's naiveSyncBN to BN. + + Args: + config (str or :obj:`mmcv.Config`): Config file path or the config + object. + """ + if isinstance(config, dict): + for item in config: + if item == 'norm_cfg': + config[item]['type'] = config[item]['type']. \ + replace('naiveSyncBN', 'BN') + else: + convert_SyncBN(config[item]) + + +def init_model(config, checkpoint=None, device='cuda:0'): + """Initialize a model from config file, which could be a 3D detector or a + 3D segmentor. + + Args: + config (str or :obj:`mmcv.Config`): Config file path or the config + object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + device (str): Device to use. + + Returns: + nn.Module: The constructed detector. + """ + if isinstance(config, str): + config = mmcv.Config.fromfile(config) + elif not isinstance(config, mmcv.Config): + raise TypeError('config must be a filename or Config object, ' + f'but got {type(config)}') + config.model.pretrained = None + convert_SyncBN(config.model) + config.model.train_cfg = None + model = build_model(config.model, test_cfg=config.get('test_cfg')) + if checkpoint is not None: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + if 'CLASSES' in checkpoint['meta']: + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = config.class_names + if 'PALETTE' in checkpoint['meta']: # 3D Segmentor + model.PALETTE = checkpoint['meta']['PALETTE'] + model.cfg = config # save the config in the model for convenience + if device != 'cpu': + torch.cuda.set_device(device) + else: + logger = get_root_logger() + logger.warning('Don\'t suggest using CPU device. ' + 'Some functions are not supported for now.') + model.to(device) + model.eval() + return model + + +def inference_detector(model, pcd): + """Inference point cloud with the detector. + + Args: + model (nn.Module): The loaded detector. + pcd (str): Point cloud files. 
+ + Returns: + tuple: Predicted results and data from pipeline. + """ + cfg = model.cfg + device = next(model.parameters()).device # model device + + if not isinstance(pcd, str): + cfg = cfg.copy() + # set loading pipeline type + cfg.data.test.pipeline[0].type = 'LoadPointsFromDict' + + # build the data pipeline + test_pipeline = deepcopy(cfg.data.test.pipeline) + test_pipeline = Compose(test_pipeline) + box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) + + if isinstance(pcd, str): + # load from point clouds file + data = dict( + pts_filename=pcd, + box_type_3d=box_type_3d, + box_mode_3d=box_mode_3d, + # for ScanNet demo we need axis_align_matrix + ann_info=dict(axis_align_matrix=np.eye(4)), + sweeps=[], + # set timestamp = 0 + timestamp=[0], + img_fields=[], + bbox3d_fields=[], + pts_mask_fields=[], + pts_seg_fields=[], + bbox_fields=[], + mask_fields=[], + seg_fields=[]) + else: + # load from http + data = dict( + points=pcd, + box_type_3d=box_type_3d, + box_mode_3d=box_mode_3d, + # for ScanNet demo we need axis_align_matrix + ann_info=dict(axis_align_matrix=np.eye(4)), + sweeps=[], + # set timestamp = 0 + timestamp=[0], + img_fields=[], + bbox3d_fields=[], + pts_mask_fields=[], + pts_seg_fields=[], + bbox_fields=[], + mask_fields=[], + seg_fields=[]) + data = test_pipeline(data) + data = collate([data], samples_per_gpu=1) + if next(model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [device.index])[0] + else: + # this is a workaround to avoid the bug of MMDataParallel + data['img_metas'] = data['img_metas'][0].data + data['points'] = data['points'][0].data + # forward the model + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + return result, data + + +def inference_multi_modality_detector(model, pcd, image, ann_file): + """Inference point cloud with the multi-modality detector. + + Args: + model (nn.Module): The loaded detector. + pcd (str): Point cloud files. + image (str): Image files. + ann_file (str): Annotation files. + + Returns: + tuple: Predicted results and data from pipeline. + """ + cfg = model.cfg + device = next(model.parameters()).device # model device + # build the data pipeline + test_pipeline = deepcopy(cfg.data.test.pipeline) + test_pipeline = Compose(test_pipeline) + box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) + # get data info containing calib + data_infos = mmcv.load(ann_file) + image_idx = int(re.findall(r'\d+', image)[-1]) # xxx/sunrgbd_000017.jpg + for x in data_infos: + if int(x['image']['image_idx']) != image_idx: + continue + info = x + break + data = dict( + pts_filename=pcd, + img_prefix=osp.dirname(image), + img_info=dict(filename=osp.basename(image)), + box_type_3d=box_type_3d, + box_mode_3d=box_mode_3d, + img_fields=[], + bbox3d_fields=[], + pts_mask_fields=[], + pts_seg_fields=[], + bbox_fields=[], + mask_fields=[], + seg_fields=[]) + data = test_pipeline(data) + + # TODO: this code is dataset-specific. Move lidar2img and + # depth2img to .pkl annotations in the future. 
+ # LiDAR to image conversion + if box_mode_3d == Box3DMode.LIDAR: + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P2 = info['calib']['P2'].astype(np.float32) + lidar2img = P2 @ rect @ Trv2c + data['img_metas'][0].data['lidar2img'] = lidar2img + # Depth to image conversion + elif box_mode_3d == Box3DMode.DEPTH: + rt_mat = info['calib']['Rt'] + # follow Coord3DMode.convert_point + rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0] + ]) @ rt_mat.transpose(1, 0) + depth2img = info['calib']['K'] @ rt_mat + data['img_metas'][0].data['depth2img'] = depth2img + + data = collate([data], samples_per_gpu=1) + if next(model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [device.index])[0] + else: + # this is a workaround to avoid the bug of MMDataParallel + data['img_metas'] = data['img_metas'][0].data + data['points'] = data['points'][0].data + data['img'] = data['img'][0].data + + # forward the model + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + return result, data + + +def inference_mono_3d_detector(model, image, ann_file): + """Inference image with the monocular 3D detector. + + Args: + model (nn.Module): The loaded detector. + image (str): Image files. + ann_file (str): Annotation files. + + Returns: + tuple: Predicted results and data from pipeline. + """ + cfg = model.cfg + device = next(model.parameters()).device # model device + # build the data pipeline + test_pipeline = deepcopy(cfg.data.test.pipeline) + test_pipeline = Compose(test_pipeline) + box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) + # get data info containing calib + data_infos = mmcv.load(ann_file) + # find the info corresponding to this image + for x in data_infos['images']: + if osp.basename(x['file_name']) != osp.basename(image): + continue + img_info = x + break + data = dict( + img_prefix=osp.dirname(image), + img_info=dict(filename=osp.basename(image)), + box_type_3d=box_type_3d, + box_mode_3d=box_mode_3d, + img_fields=[], + bbox3d_fields=[], + pts_mask_fields=[], + pts_seg_fields=[], + bbox_fields=[], + mask_fields=[], + seg_fields=[]) + + # camera points to image conversion + if box_mode_3d == Box3DMode.CAM: + data['img_info'].update(dict(cam_intrinsic=img_info['cam_intrinsic'])) + + data = test_pipeline(data) + + data = collate([data], samples_per_gpu=1) + if next(model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [device.index])[0] + else: + # this is a workaround to avoid the bug of MMDataParallel + data['img_metas'] = data['img_metas'][0].data + data['img'] = data['img'][0].data + + # forward the model + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + return result, data + + +def inference_segmentor(model, pcd): + """Inference point cloud with the segmentor. + + Args: + model (nn.Module): The loaded segmentor. + pcd (str): Point cloud files. + + Returns: + tuple: Predicted results and data from pipeline. 
+ """ + cfg = model.cfg + device = next(model.parameters()).device # model device + # build the data pipeline + test_pipeline = deepcopy(cfg.data.test.pipeline) + test_pipeline = Compose(test_pipeline) + data = dict( + pts_filename=pcd, + img_fields=[], + bbox3d_fields=[], + pts_mask_fields=[], + pts_seg_fields=[], + bbox_fields=[], + mask_fields=[], + seg_fields=[]) + data = test_pipeline(data) + data = collate([data], samples_per_gpu=1) + if next(model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [device.index])[0] + else: + # this is a workaround to avoid the bug of MMDataParallel + data['img_metas'] = data['img_metas'][0].data + data['points'] = data['points'][0].data + # forward the model + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + return result, data + + +def show_det_result_meshlab(data, + result, + out_dir, + score_thr=0.0, + show=False, + snapshot=False): + """Show 3D detection result by meshlab.""" + points = data['points'][0][0].cpu().numpy() + pts_filename = data['img_metas'][0][0]['pts_filename'] + file_name = osp.split(pts_filename)[-1].split('.')[0] + + if 'pts_bbox' in result[0].keys(): + pred_bboxes = result[0]['pts_bbox']['boxes_3d'].tensor.numpy() + pred_scores = result[0]['pts_bbox']['scores_3d'].numpy() + else: + pred_bboxes = result[0]['boxes_3d'].tensor.numpy() + pred_scores = result[0]['scores_3d'].numpy() + + # filter out low score bboxes for visualization + if score_thr > 0: + inds = pred_scores > score_thr + pred_bboxes = pred_bboxes[inds] + + # for now we convert points into depth mode + box_mode = data['img_metas'][0][0]['box_mode_3d'] + if box_mode != Box3DMode.DEPTH: + points = Coord3DMode.convert(points, box_mode, Coord3DMode.DEPTH) + show_bboxes = Box3DMode.convert(pred_bboxes, box_mode, Box3DMode.DEPTH) + else: + show_bboxes = deepcopy(pred_bboxes) + + show_result( + points, + None, + show_bboxes, + out_dir, + file_name, + show=show, + snapshot=snapshot) + + return file_name + + +def show_seg_result_meshlab(data, + result, + out_dir, + palette, + show=False, + snapshot=False): + """Show 3D segmentation result by meshlab.""" + points = data['points'][0][0].cpu().numpy() + pts_filename = data['img_metas'][0][0]['pts_filename'] + file_name = osp.split(pts_filename)[-1].split('.')[0] + + pred_seg = result[0]['semantic_mask'].numpy() + + if palette is None: + # generate random color map + max_idx = pred_seg.max() + palette = np.random.randint(0, 256, size=(max_idx + 1, 3)) + palette = np.array(palette).astype(np.int) + + show_seg_result( + points, + None, + pred_seg, + out_dir, + file_name, + palette=palette, + show=show, + snapshot=snapshot) + + return file_name + + +def show_proj_det_result_meshlab(data, + result, + out_dir, + score_thr=0.0, + show=False, + snapshot=False): + """Show result of projecting 3D bbox to 2D image by meshlab.""" + assert 'img' in data.keys(), 'image data is not provided for visualization' + + img_filename = data['img_metas'][0][0]['filename'] + file_name = osp.split(img_filename)[-1].split('.')[0] + + # read from file because img in data_dict has undergone pipeline transform + img = mmcv.imread(img_filename) + + if 'pts_bbox' in result[0].keys(): + result[0] = result[0]['pts_bbox'] + elif 'img_bbox' in result[0].keys(): + result[0] = result[0]['img_bbox'] + pred_bboxes = result[0]['boxes_3d'].tensor.numpy() + pred_scores = result[0]['scores_3d'].numpy() + + # filter out low score bboxes for visualization + if score_thr > 0: + inds = pred_scores > score_thr + 
pred_bboxes = pred_bboxes[inds] + + box_mode = data['img_metas'][0][0]['box_mode_3d'] + if box_mode == Box3DMode.LIDAR: + if 'lidar2img' not in data['img_metas'][0][0]: + raise NotImplementedError( + 'LiDAR to image transformation matrix is not provided') + + show_bboxes = LiDARInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0)) + + show_multi_modality_result( + img, + None, + show_bboxes, + data['img_metas'][0][0]['lidar2img'], + out_dir, + file_name, + box_mode='lidar', + show=show) + elif box_mode == Box3DMode.DEPTH: + show_bboxes = DepthInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0)) + + show_multi_modality_result( + img, + None, + show_bboxes, + None, + out_dir, + file_name, + box_mode='depth', + img_metas=data['img_metas'][0][0], + show=show) + elif box_mode == Box3DMode.CAM: + if 'cam2img' not in data['img_metas'][0][0]: + raise NotImplementedError( + 'camera intrinsic matrix is not provided') + + show_bboxes = CameraInstance3DBoxes( + pred_bboxes, box_dim=pred_bboxes.shape[-1], origin=(0.5, 1.0, 0.5)) + + show_multi_modality_result( + img, + None, + show_bboxes, + data['img_metas'][0][0]['cam2img'], + out_dir, + file_name, + box_mode='camera', + show=show) + else: + raise NotImplementedError( + f'visualization of {box_mode} bbox is not supported') + + return file_name + + +def show_result_meshlab(data, + result, + out_dir, + score_thr=0.0, + show=False, + snapshot=False, + task='det', + palette=None): + """Show result by meshlab. + + Args: + data (dict): Contain data from pipeline. + result (dict): Predicted result from model. + out_dir (str): Directory to save visualized result. + score_thr (float, optional): Minimum score of bboxes to be shown. + Default: 0.0 + show (bool, optional): Visualize the results online. Defaults to False. + snapshot (bool, optional): Whether to save the online results. + Defaults to False. + task (str, optional): Distinguish which task result to visualize. + Currently we support 3D detection, multi-modality detection and + 3D segmentation. Defaults to 'det'. + palette (list[list[int]]] | np.ndarray, optional): The palette + of segmentation map. If None is given, random palette will be + generated. Defaults to None. + """ + assert task in ['det', 'multi_modality-det', 'seg', 'mono-det'], \ + f'unsupported visualization task {task}' + assert out_dir is not None, 'Expect out_dir, got none.' + + if task in ['det', 'multi_modality-det']: + file_name = show_det_result_meshlab(data, result, out_dir, score_thr, + show, snapshot) + + if task in ['seg']: + file_name = show_seg_result_meshlab(data, result, out_dir, palette, + show, snapshot) + + if task in ['multi_modality-det', 'mono-det']: + file_name = show_proj_det_result_meshlab(data, result, out_dir, + score_thr, show, snapshot) + + return out_dir, file_name diff --git a/mmdet3d/apis/test.py b/mmdet3d/apis/test.py index c0e66c0..206037c 100644 --- a/mmdet3d/apis/test.py +++ b/mmdet3d/apis/test.py @@ -1,90 +1,90 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from os import path as osp - -import mmcv -import torch -from mmcv.image import tensor2imgs - -from mmdet3d.models import (Base3DDetector, Base3DSegmentor, - SingleStageMono3DDetector) - - -def single_gpu_test(model, - data_loader, - show=False, - out_dir=None, - show_score_thr=0.3): - """Test model with single gpu. - - This method tests model with single gpu and gives the 'show' option. - By setting ``show=True``, it saves the visualization results under - ``out_dir``. - - Args: - model (nn.Module): Model to be tested. 
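For reference, the inference helpers above (point-cloud, monocular and segmentation) all return a (result, data) pair that feeds directly into show_result_meshlab. A minimal monocular sketch, assuming the usual init_model helper from mmdet3d.apis and using placeholder config, checkpoint, image and annotation paths:

from mmdet3d.apis import (inference_mono_3d_detector, init_model,
                          show_result_meshlab)

# placeholder paths; substitute a real mono-3D config, checkpoint, image and
# COCO-style info file for your dataset
model = init_model('path/to/mono3d_config.py', 'path/to/checkpoint.pth',
                   device='cuda:0')
result, data = inference_mono_3d_detector(model, 'path/to/image.png',
                                          'path/to/infos_mono3d.coco.json')
# project the predicted 3D boxes back onto the image and save the result
out_dir, file_name = show_result_meshlab(
    data, result, out_dir='demo_out', score_thr=0.3, task='mono-det')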
- data_loader (nn.Dataloader): Pytorch data loader. - show (bool, optional): Whether to save viualization results. - Default: True. - out_dir (str, optional): The path to save visualization results. - Default: None. - - Returns: - list[dict]: The prediction results. - """ - model.eval() - results = [] - dataset = data_loader.dataset - prog_bar = mmcv.ProgressBar(len(dataset)) - for i, data in enumerate(data_loader): - with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - - if show: - # Visualize the results of MMDetection3D model - # 'show_results' is MMdetection3D visualization API - models_3d = (Base3DDetector, Base3DSegmentor, - SingleStageMono3DDetector) - if isinstance(model.module, models_3d): - model.module.show_results( - data, - result, - out_dir=out_dir, - show=show, - score_thr=show_score_thr) - # Visualize the results of MMDetection model - # 'show_result' is MMdetection visualization API - else: - batch_size = len(result) - if batch_size == 1 and isinstance(data['img'][0], - torch.Tensor): - img_tensor = data['img'][0] - else: - img_tensor = data['img'][0].data[0] - img_metas = data['img_metas'][0].data[0] - imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) - assert len(imgs) == len(img_metas) - - for i, (img, img_meta) in enumerate(zip(imgs, img_metas)): - h, w, _ = img_meta['img_shape'] - img_show = img[:h, :w, :] - - ori_h, ori_w = img_meta['ori_shape'][:-1] - img_show = mmcv.imresize(img_show, (ori_w, ori_h)) - - if out_dir: - out_file = osp.join(out_dir, img_meta['ori_filename']) - else: - out_file = None - - model.module.show_result( - img_show, - result[i], - show=show, - out_file=out_file, - score_thr=show_score_thr) - results.extend(result) - - batch_size = len(result) - for _ in range(batch_size): - prog_bar.update() - return results +# Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + +import mmcv +import torch +from mmcv.image import tensor2imgs + +from mmdet3d.models import (Base3DDetector, Base3DSegmentor, + SingleStageMono3DDetector) + + +def single_gpu_test(model, + data_loader, + show=False, + out_dir=None, + show_score_thr=0.3): + """Test model with single gpu. + + This method tests model with single gpu and gives the 'show' option. + By setting ``show=True``, it saves the visualization results under + ``out_dir``. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + show (bool, optional): Whether to save viualization results. + Default: True. + out_dir (str, optional): The path to save visualization results. + Default: None. + + Returns: + list[dict]: The prediction results. 
+ """ + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + + if show: + # Visualize the results of MMDetection3D model + # 'show_results' is MMdetection3D visualization API + models_3d = (Base3DDetector, Base3DSegmentor, + SingleStageMono3DDetector) + if isinstance(model.module, models_3d): + model.module.show_results( + data, + result, + out_dir=out_dir, + show=show, + score_thr=show_score_thr) + # Visualize the results of MMDetection model + # 'show_result' is MMdetection visualization API + else: + batch_size = len(result) + if batch_size == 1 and isinstance(data['img'][0], + torch.Tensor): + img_tensor = data['img'][0] + else: + img_tensor = data['img'][0].data[0] + img_metas = data['img_metas'][0].data[0] + imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) + assert len(imgs) == len(img_metas) + + for i, (img, img_meta) in enumerate(zip(imgs, img_metas)): + h, w, _ = img_meta['img_shape'] + img_show = img[:h, :w, :] + + ori_h, ori_w = img_meta['ori_shape'][:-1] + img_show = mmcv.imresize(img_show, (ori_w, ori_h)) + + if out_dir: + out_file = osp.join(out_dir, img_meta['ori_filename']) + else: + out_file = None + + model.module.show_result( + img_show, + result[i], + show=show, + out_file=out_file, + score_thr=show_score_thr) + results.extend(result) + + batch_size = len(result) + for _ in range(batch_size): + prog_bar.update() + return results diff --git a/mmdet3d/apis/train.py b/mmdet3d/apis/train.py index 4d97026..3dc16ac 100644 --- a/mmdet3d/apis/train.py +++ b/mmdet3d/apis/train.py @@ -1,351 +1,351 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import random -import warnings - -import numpy as np -import torch -from mmcv.parallel import MMDataParallel, MMDistributedDataParallel -from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, - Fp16OptimizerHook, OptimizerHook, build_optimizer, - build_runner, get_dist_info) -from mmcv.utils import build_from_cfg -from torch import distributed as dist - -from mmdet3d.datasets import build_dataset -from mmdet3d.utils import find_latest_checkpoint -from mmdet.core import DistEvalHook as MMDET_DistEvalHook -from mmdet.core import EvalHook as MMDET_EvalHook -from mmdet.datasets import build_dataloader as build_mmdet_dataloader -from mmdet.datasets import replace_ImageToTensor -from mmdet.utils import get_root_logger as get_mmdet_root_logger -from mmseg.core import DistEvalHook as MMSEG_DistEvalHook -from mmseg.core import EvalHook as MMSEG_EvalHook -from mmseg.datasets import build_dataloader as build_mmseg_dataloader -from mmseg.utils import get_root_logger as get_mmseg_root_logger - - -def init_random_seed(seed=None, device='cuda'): - """Initialize random seed. - - If the seed is not set, the seed will be automatically randomized, - and then broadcast to all processes to prevent some potential bugs. - Args: - seed (int, optional): The seed. Default to None. - device (str, optional): The device where the seed will be put on. - Default to 'cuda'. - Returns: - int: Seed to be used. - """ - if seed is not None: - return seed - - # Make sure all ranks share the same random seed to prevent - # some potential bugs. 
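single_gpu_test expects the model to already be wrapped in MMDataParallel and paired with a test dataloader. A condensed driver sketch in the spirit of tools/test.py, with the config and checkpoint paths as placeholders:

import mmcv
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint
from mmdet.datasets import build_dataloader
from mmdet3d.apis import single_gpu_test
from mmdet3d.datasets import build_dataset
from mmdet3d.models import build_model

cfg = mmcv.Config.fromfile('path/to/config.py')  # placeholder config
dataset = build_dataset(cfg.data.test)
data_loader = build_dataloader(
    dataset, samples_per_gpu=1, workers_per_gpu=2, dist=False, shuffle=False)
model = build_model(cfg.model, test_cfg=cfg.get('test_cfg'))
load_checkpoint(model, 'path/to/checkpoint.pth', map_location='cpu')
model = MMDataParallel(model.cuda(0), device_ids=[0])
outputs = single_gpu_test(model, data_loader, show=False, out_dir=None)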
Please refer to - # https://github.com/open-mmlab/mmdetection/issues/6339 - rank, world_size = get_dist_info() - seed = np.random.randint(2**31) - if world_size == 1: - return seed - - if rank == 0: - random_num = torch.tensor(seed, dtype=torch.int32, device=device) - else: - random_num = torch.tensor(0, dtype=torch.int32, device=device) - dist.broadcast(random_num, src=0) - return random_num.item() - - -def set_random_seed(seed, deterministic=False): - """Set random seed. - - Args: - seed (int): Seed to be used. - deterministic (bool): Whether to set the deterministic option for - CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` - to True and `torch.backends.cudnn.benchmark` to False. - Default: False. - """ - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - if deterministic: - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - - -def train_segmentor(model, - dataset, - cfg, - distributed=False, - validate=False, - timestamp=None, - meta=None): - """Launch segmentor training.""" - logger = get_mmseg_root_logger(cfg.log_level) - - # prepare data loaders - dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] - data_loaders = [ - build_mmseg_dataloader( - ds, - cfg.data.samples_per_gpu, - cfg.data.workers_per_gpu, - # cfg.gpus will be ignored if distributed - len(cfg.gpu_ids), - dist=distributed, - seed=cfg.seed, - drop_last=True) for ds in dataset - ] - - # put model on gpus - if distributed: - find_unused_parameters = cfg.get('find_unused_parameters', False) - # Sets the `find_unused_parameters` parameter in - # torch.nn.parallel.DistributedDataParallel - model = MMDistributedDataParallel( - model.cuda(), - device_ids=[torch.cuda.current_device()], - broadcast_buffers=False, - find_unused_parameters=find_unused_parameters) - else: - model = MMDataParallel( - model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) - - # build runner - optimizer = build_optimizer(model, cfg.optimizer) - - if cfg.get('runner') is None: - cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} - warnings.warn( - 'config is now expected to have a `runner` section, ' - 'please set `runner` in your config.', UserWarning) - - runner = build_runner( - cfg.runner, - default_args=dict( - model=model, - batch_processor=None, - optimizer=optimizer, - work_dir=cfg.work_dir, - logger=logger, - meta=meta)) - - # register hooks - runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, - cfg.checkpoint_config, cfg.log_config, - cfg.get('momentum_config', None)) - - # an ugly walkaround to make the .log and .log.json filenames the same - runner.timestamp = timestamp - - # register eval hooks - if validate: - val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) - val_dataloader = build_mmseg_dataloader( - val_dataset, - samples_per_gpu=1, - workers_per_gpu=cfg.data.workers_per_gpu, - dist=distributed, - shuffle=False) - eval_cfg = cfg.get('evaluation', {}) - eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' - eval_hook = MMSEG_DistEvalHook if distributed else MMSEG_EvalHook - # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the - # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. 
- runner.register_hook( - eval_hook(val_dataloader, **eval_cfg), priority='LOW') - - # user-defined hooks - if cfg.get('custom_hooks', None): - custom_hooks = cfg.custom_hooks - assert isinstance(custom_hooks, list), \ - f'custom_hooks expect list type, but got {type(custom_hooks)}' - for hook_cfg in cfg.custom_hooks: - assert isinstance(hook_cfg, dict), \ - 'Each item in custom_hooks expects dict type, but got ' \ - f'{type(hook_cfg)}' - hook_cfg = hook_cfg.copy() - priority = hook_cfg.pop('priority', 'NORMAL') - hook = build_from_cfg(hook_cfg, HOOKS) - runner.register_hook(hook, priority=priority) - - if cfg.resume_from: - runner.resume(cfg.resume_from) - elif cfg.load_from: - runner.load_checkpoint(cfg.load_from) - runner.run(data_loaders, cfg.workflow) - - -def train_detector(model, - dataset, - cfg, - distributed=False, - validate=False, - timestamp=None, - meta=None): - logger = get_mmdet_root_logger(log_level=cfg.log_level) - - # prepare data loaders - dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] - if 'imgs_per_gpu' in cfg.data: - logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' - 'Please use "samples_per_gpu" instead') - if 'samples_per_gpu' in cfg.data: - logger.warning( - f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' - f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' - f'={cfg.data.imgs_per_gpu} is used in this experiments') - else: - logger.warning( - 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' - f'{cfg.data.imgs_per_gpu} in this experiments') - cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu - - runner_type = 'EpochBasedRunner' if 'runner' not in cfg else cfg.runner[ - 'type'] - data_loaders = [ - build_mmdet_dataloader( - ds, - cfg.data.samples_per_gpu, - cfg.data.workers_per_gpu, - # `num_gpus` will be ignored if distributed - num_gpus=len(cfg.gpu_ids), - dist=distributed, - seed=cfg.seed, - runner_type=runner_type, - persistent_workers=cfg.data.get('persistent_workers', False)) - for ds in dataset - ] - - # put model on gpus - if distributed: - find_unused_parameters = cfg.get('find_unused_parameters', False) - # Sets the `find_unused_parameters` parameter in - # torch.nn.parallel.DistributedDataParallel - model = MMDistributedDataParallel( - model.cuda(), - device_ids=[torch.cuda.current_device()], - broadcast_buffers=False, - find_unused_parameters=find_unused_parameters) - else: - model = MMDataParallel( - model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) - - # build runner - optimizer = build_optimizer(model, cfg.optimizer) - - if 'runner' not in cfg: - cfg.runner = { - 'type': 'EpochBasedRunner', - 'max_epochs': cfg.total_epochs - } - warnings.warn( - 'config is now expected to have a `runner` section, ' - 'please set `runner` in your config.', UserWarning) - else: - if 'total_epochs' in cfg: - assert cfg.total_epochs == cfg.runner.max_epochs - - runner = build_runner( - cfg.runner, - default_args=dict( - model=model, - optimizer=optimizer, - work_dir=cfg.work_dir, - logger=logger, - meta=meta)) - - # an ugly workaround to make .log and .log.json filenames the same - runner.timestamp = timestamp - - # fp16 setting - fp16_cfg = cfg.get('fp16', None) - if fp16_cfg is not None: - optimizer_config = Fp16OptimizerHook( - **cfg.optimizer_config, **fp16_cfg, distributed=distributed) - elif distributed and 'type' not in cfg.optimizer_config: - optimizer_config = OptimizerHook(**cfg.optimizer_config) - else: - optimizer_config = cfg.optimizer_config - - # register hooks - runner.register_training_hooks( - 
cfg.lr_config, - optimizer_config, - cfg.checkpoint_config, - cfg.log_config, - cfg.get('momentum_config', None), - custom_hooks_config=cfg.get('custom_hooks', None)) - - if distributed: - if isinstance(runner, EpochBasedRunner): - runner.register_hook(DistSamplerSeedHook()) - - # register eval hooks - if validate: - # Support batch_size > 1 in validation - val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) - if val_samples_per_gpu > 1: - # Replace 'ImageToTensor' to 'DefaultFormatBundle' - cfg.data.val.pipeline = replace_ImageToTensor( - cfg.data.val.pipeline) - val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) - val_dataloader = build_mmdet_dataloader( - val_dataset, - samples_per_gpu=val_samples_per_gpu, - workers_per_gpu=cfg.data.workers_per_gpu, - dist=distributed, - shuffle=False) - eval_cfg = cfg.get('evaluation', {}) - eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' - eval_hook = MMDET_DistEvalHook if distributed else MMDET_EvalHook - # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the - # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. - runner.register_hook( - eval_hook(val_dataloader, **eval_cfg), priority='LOW') - - resume_from = None - if cfg.resume_from is None and cfg.get('auto_resume'): - resume_from = find_latest_checkpoint(cfg.work_dir) - - if resume_from is not None: - cfg.resume_from = resume_from - - if cfg.resume_from: - runner.resume(cfg.resume_from) - elif cfg.load_from: - runner.load_checkpoint(cfg.load_from) - runner.run(data_loaders, cfg.workflow) - - -def train_model(model, - dataset, - cfg, - distributed=False, - validate=False, - timestamp=None, - meta=None): - """A function wrapper for launching model training according to cfg. - - Because we need different eval_hook in runner. Should be deprecated in the - future. - """ - if cfg.model.type in ['EncoderDecoder3D']: - train_segmentor( - model, - dataset, - cfg, - distributed=distributed, - validate=validate, - timestamp=timestamp, - meta=meta) - else: - train_detector( - model, - dataset, - cfg, - distributed=distributed, - validate=validate, - timestamp=timestamp, - meta=meta) +# Copyright (c) OpenMMLab. All rights reserved. +import random +import warnings + +import numpy as np +import torch +from mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, build_optimizer, + build_runner, get_dist_info) +from mmcv.utils import build_from_cfg +from torch import distributed as dist + +from mmdet3d.datasets import build_dataset +from mmdet3d.utils import find_latest_checkpoint +from mmdet.core import DistEvalHook as MMDET_DistEvalHook +from mmdet.core import EvalHook as MMDET_EvalHook +from mmdet.datasets import build_dataloader as build_mmdet_dataloader +from mmdet.datasets import replace_ImageToTensor +from mmdet.utils import get_root_logger as get_mmdet_root_logger +from mmseg.core import DistEvalHook as MMSEG_DistEvalHook +from mmseg.core import EvalHook as MMSEG_EvalHook +from mmseg.datasets import build_dataloader as build_mmseg_dataloader +from mmseg.utils import get_root_logger as get_mmseg_root_logger + + +def init_random_seed(seed=None, device='cuda'): + """Initialize random seed. + + If the seed is not set, the seed will be automatically randomized, + and then broadcast to all processes to prevent some potential bugs. + Args: + seed (int, optional): The seed. Default to None. 
+ device (str, optional): The device where the seed will be put on. + Default to 'cuda'. + Returns: + int: Seed to be used. + """ + if seed is not None: + return seed + + # Make sure all ranks share the same random seed to prevent + # some potential bugs. Please refer to + # https://github.com/open-mmlab/mmdetection/issues/6339 + rank, world_size = get_dist_info() + seed = np.random.randint(2**31) + if world_size == 1: + return seed + + if rank == 0: + random_num = torch.tensor(seed, dtype=torch.int32, device=device) + else: + random_num = torch.tensor(0, dtype=torch.int32, device=device) + dist.broadcast(random_num, src=0) + return random_num.item() + + +def set_random_seed(seed, deterministic=False): + """Set random seed. + + Args: + seed (int): Seed to be used. + deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Default: False. + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def train_segmentor(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """Launch segmentor training.""" + logger = get_mmseg_root_logger(cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + data_loaders = [ + build_mmseg_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + drop_last=True) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if cfg.get('runner') is None: + cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + batch_processor=None, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # register hooks + runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # an ugly walkaround to make the .log and .log.json filenames the same + runner.timestamp = timestamp + + # register eval hooks + if validate: + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_mmseg_dataloader( + val_dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_hook = MMSEG_DistEvalHook if distributed else MMSEG_EvalHook + # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the + # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. 
+ runner.register_hook( + eval_hook(val_dataloader, **eval_cfg), priority='LOW') + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + + +def train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + logger = get_mmdet_root_logger(log_level=cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + if 'imgs_per_gpu' in cfg.data: + logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + runner_type = 'EpochBasedRunner' if 'runner' not in cfg else cfg.runner[ + 'type'] + data_loaders = [ + build_mmdet_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # `num_gpus` will be ignored if distributed + num_gpus=len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + runner_type=runner_type, + persistent_workers=cfg.data.get('persistent_workers', False)) + for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks( + 
cfg.lr_config, + optimizer_config, + cfg.checkpoint_config, + cfg.log_config, + cfg.get('momentum_config', None), + custom_hooks_config=cfg.get('custom_hooks', None)) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_mmdet_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_hook = MMDET_DistEvalHook if distributed else MMDET_EvalHook + # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the + # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. + runner.register_hook( + eval_hook(val_dataloader, **eval_cfg), priority='LOW') + + resume_from = None + if cfg.resume_from is None and cfg.get('auto_resume'): + resume_from = find_latest_checkpoint(cfg.work_dir) + + if resume_from is not None: + cfg.resume_from = resume_from + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/mmdet3d/core/__init__.py b/mmdet3d/core/__init__.py index ffb0c1a..326d555 100644 --- a/mmdet3d/core/__init__.py +++ b/mmdet3d/core/__init__.py @@ -1,9 +1,9 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .anchor import * # noqa: F401, F403 -from .bbox import * # noqa: F401, F403 -from .evaluation import * # noqa: F401, F403 -from .points import * # noqa: F401, F403 -from .post_processing import * # noqa: F401, F403 -from .utils import * # noqa: F401, F403 -from .visualizer import * # noqa: F401, F403 -from .voxel import * # noqa: F401, F403 +# Copyright (c) OpenMMLab. All rights reserved. +from .anchor import * # noqa: F401, F403 +from .bbox import * # noqa: F401, F403 +from .evaluation import * # noqa: F401, F403 +from .points import * # noqa: F401, F403 +from .post_processing import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 +from .visualizer import * # noqa: F401, F403 +from .voxel import * # noqa: F401, F403 diff --git a/mmdet3d/core/anchor/__init__.py b/mmdet3d/core/anchor/__init__.py index 7a34bf5..638845f 100644 --- a/mmdet3d/core/anchor/__init__.py +++ b/mmdet3d/core/anchor/__init__.py @@ -1,10 +1,10 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
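train_model is a thin dispatcher: configs with model.type == 'EncoderDecoder3D' go through train_segmentor, everything else through train_detector. A condensed sketch of how it is driven from a training entry point, mirroring the usual tools/train.py flow; the config path and work_dir are placeholders and the helpers are assumed to be re-exported from mmdet3d.apis as in upstream MMDetection3D:

import mmcv
from mmdet3d.apis import init_random_seed, set_random_seed, train_model
from mmdet3d.datasets import build_dataset
from mmdet3d.models import build_model

cfg = mmcv.Config.fromfile('path/to/config.py')  # placeholder config
cfg.work_dir = 'work_dirs/example'               # placeholder output dir
cfg.gpu_ids = [0]

# draw one seed shared across ranks, then seed python/numpy/torch
cfg.seed = init_random_seed(None)
set_random_seed(cfg.seed, deterministic=False)

datasets = [build_dataset(cfg.data.train)]
model = build_model(
    cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg'))
train_model(model, datasets, cfg, distributed=False, validate=True)

With distributed=True the same call path wraps the model in MMDistributedDataParallel instead of MMDataParallel, so torch.distributed must be initialized beforehand.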
-from mmdet.core.anchor import build_prior_generator -from .anchor_3d_generator import (AlignedAnchor3DRangeGenerator, - AlignedAnchor3DRangeGeneratorPerCls, - Anchor3DRangeGenerator) - -__all__ = [ - 'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator', - 'build_prior_generator', 'AlignedAnchor3DRangeGeneratorPerCls' -] +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.core.anchor import build_prior_generator +from .anchor_3d_generator import (AlignedAnchor3DRangeGenerator, + AlignedAnchor3DRangeGeneratorPerCls, + Anchor3DRangeGenerator) + +__all__ = [ + 'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator', + 'build_prior_generator', 'AlignedAnchor3DRangeGeneratorPerCls' +] diff --git a/mmdet3d/core/anchor/anchor_3d_generator.py b/mmdet3d/core/anchor/anchor_3d_generator.py index e8681b7..2c20070 100644 --- a/mmdet3d/core/anchor/anchor_3d_generator.py +++ b/mmdet3d/core/anchor/anchor_3d_generator.py @@ -1,419 +1,419 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import torch - -from mmdet.core.anchor import ANCHOR_GENERATORS - - -@ANCHOR_GENERATORS.register_module() -class Anchor3DRangeGenerator(object): - """3D Anchor Generator by range. - - This anchor generator generates anchors by the given range in different - feature levels. - Due the convention in 3D detection, different anchor sizes are related to - different ranges for different categories. However we find this setting - does not effect the performance much in some datasets, e.g., nuScenes. - - Args: - ranges (list[list[float]]): Ranges of different anchors. - The ranges are the same across different feature levels. But may - vary for different anchor sizes if size_per_range is True. - sizes (list[list[float]], optional): 3D sizes of anchors. - Defaults to [[3.9, 1.6, 1.56]]. - scales (list[int], optional): Scales of anchors in different feature - levels. Defaults to [1]. - rotations (list[float], optional): Rotations of anchors in a feature - grid. Defaults to [0, 1.5707963]. - custom_values (tuple[float], optional): Customized values of that - anchor. For example, in nuScenes the anchors have velocities. - Defaults to (). - reshape_out (bool, optional): Whether to reshape the output into - (N x 4). Defaults to True. - size_per_range (bool, optional): Whether to use separate ranges for - different sizes. If size_per_range is True, the ranges should have - the same length as the sizes, if not, it will be duplicated. - Defaults to True. 
- """ - - def __init__(self, - ranges, - sizes=[[3.9, 1.6, 1.56]], - scales=[1], - rotations=[0, 1.5707963], - custom_values=(), - reshape_out=True, - size_per_range=True): - assert mmcv.is_list_of(ranges, list) - if size_per_range: - if len(sizes) != len(ranges): - assert len(ranges) == 1 - ranges = ranges * len(sizes) - assert len(ranges) == len(sizes) - else: - assert len(ranges) == 1 - assert mmcv.is_list_of(sizes, list) - assert isinstance(scales, list) - - self.sizes = sizes - self.scales = scales - self.ranges = ranges - self.rotations = rotations - self.custom_values = custom_values - self.cached_anchors = None - self.reshape_out = reshape_out - self.size_per_range = size_per_range - - def __repr__(self): - s = self.__class__.__name__ + '(' - s += f'anchor_range={self.ranges},\n' - s += f'scales={self.scales},\n' - s += f'sizes={self.sizes},\n' - s += f'rotations={self.rotations},\n' - s += f'reshape_out={self.reshape_out},\n' - s += f'size_per_range={self.size_per_range})' - return s - - @property - def num_base_anchors(self): - """list[int]: Total number of base anchors in a feature grid.""" - num_rot = len(self.rotations) - num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0) - return num_rot * num_size - - @property - def num_levels(self): - """int: Number of feature levels that the generator is applied to.""" - return len(self.scales) - - def grid_anchors(self, featmap_sizes, device='cuda'): - """Generate grid anchors in multiple feature levels. - - Args: - featmap_sizes (list[tuple]): List of feature map sizes in - multiple feature levels. - device (str, optional): Device where the anchors will be put on. - Defaults to 'cuda'. - - Returns: - list[torch.Tensor]: Anchors in multiple feature levels. - The sizes of each tensor should be [N, 4], where - N = width * height * num_base_anchors, width and height - are the sizes of the corresponding feature level, - num_base_anchors is the number of anchors for that level. - """ - assert self.num_levels == len(featmap_sizes) - multi_level_anchors = [] - for i in range(self.num_levels): - anchors = self.single_level_grid_anchors( - featmap_sizes[i], self.scales[i], device=device) - if self.reshape_out: - anchors = anchors.reshape(-1, anchors.size(-1)) - multi_level_anchors.append(anchors) - return multi_level_anchors - - def single_level_grid_anchors(self, featmap_size, scale, device='cuda'): - """Generate grid anchors of a single level feature map. - - This function is usually called by method ``self.grid_anchors``. - - Args: - featmap_size (tuple[int]): Size of the feature map. - scale (float): Scale factor of the anchors in the current level. - device (str, optional): Device the tensor will be put on. - Defaults to 'cuda'. - - Returns: - torch.Tensor: Anchors in the overall feature map. 
- """ - # We reimplement the anchor generator using torch in cuda - # torch: 0.6975 s for 1000 times - # numpy: 4.3345 s for 1000 times - # which is ~5 times faster than the numpy implementation - if not self.size_per_range: - return self.anchors_single_range( - featmap_size, - self.ranges[0], - scale, - self.sizes, - self.rotations, - device=device) - - mr_anchors = [] - for anchor_range, anchor_size in zip(self.ranges, self.sizes): - mr_anchors.append( - self.anchors_single_range( - featmap_size, - anchor_range, - scale, - anchor_size, - self.rotations, - device=device)) - mr_anchors = torch.cat(mr_anchors, dim=-3) - return mr_anchors - - def anchors_single_range(self, - feature_size, - anchor_range, - scale=1, - sizes=[[3.9, 1.6, 1.56]], - rotations=[0, 1.5707963], - device='cuda'): - """Generate anchors in a single range. - - Args: - feature_size (list[float] | tuple[float]): Feature map size. It is - either a list of a tuple of [D, H, W](in order of z, y, and x). - anchor_range (torch.Tensor | list[float]): Range of anchors with - shape [6]. The order is consistent with that of anchors, i.e., - (x_min, y_min, z_min, x_max, y_max, z_max). - scale (float | int, optional): The scale factor of anchors. - Defaults to 1. - sizes (list[list] | np.ndarray | torch.Tensor, optional): - Anchor size with shape [N, 3], in order of x, y, z. - Defaults to [[3.9, 1.6, 1.56]]. - rotations (list[float] | np.ndarray | torch.Tensor, optional): - Rotations of anchors in a single feature grid. - Defaults to [0, 1.5707963]. - device (str): Devices that the anchors will be put on. - Defaults to 'cuda'. - - Returns: - torch.Tensor: Anchors with shape - [*feature_size, num_sizes, num_rots, 7]. - """ - if len(feature_size) == 2: - feature_size = [1, feature_size[0], feature_size[1]] - anchor_range = torch.tensor(anchor_range, device=device) - z_centers = torch.linspace( - anchor_range[2], anchor_range[5], feature_size[0], device=device) - y_centers = torch.linspace( - anchor_range[1], anchor_range[4], feature_size[1], device=device) - x_centers = torch.linspace( - anchor_range[0], anchor_range[3], feature_size[2], device=device) - sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale - rotations = torch.tensor(rotations, device=device) - - # torch.meshgrid default behavior is 'id', np's default is 'xy' - rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations) - # torch.meshgrid returns a tuple rather than list - rets = list(rets) - tile_shape = [1] * 5 - tile_shape[-2] = int(sizes.shape[0]) - for i in range(len(rets)): - rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) - - sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) - tile_size_shape = list(rets[0].shape) - tile_size_shape[3] = 1 - sizes = sizes.repeat(tile_size_shape) - rets.insert(3, sizes) - - ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) - # [1, 200, 176, N, 2, 7] for kitti after permute - - if len(self.custom_values) > 0: - custom_ndim = len(self.custom_values) - custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) - # custom[:] = self.custom_values - ret = torch.cat([ret, custom], dim=-1) - # [1, 200, 176, N, 2, 9] for nus dataset after permute - return ret - - -@ANCHOR_GENERATORS.register_module() -class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator): - """Aligned 3D Anchor Generator by range. - - This anchor generator uses a different manner to generate the positions - of anchors' centers from :class:`Anchor3DRangeGenerator`. 
- - Note: - The `align` means that the anchor's center is aligned with the voxel - grid, which is also the feature grid. The previous implementation of - :class:`Anchor3DRangeGenerator` does not generate the anchors' center - according to the voxel grid. Rather, it generates the center by - uniformly distributing the anchors inside the minimum and maximum - anchor ranges according to the feature map sizes. - However, this makes the anchors center does not match the feature grid. - The :class:`AlignedAnchor3DRangeGenerator` add + 1 when using the - feature map sizes to obtain the corners of the voxel grid. Then it - shifts the coordinates to the center of voxel grid and use the left - up corner to distribute anchors. - - Args: - anchor_corner (bool, optional): Whether to align with the corner of the - voxel grid. By default it is False and the anchor's center will be - the same as the corresponding voxel's center, which is also the - center of the corresponding greature grid. Defaults to False. - """ - - def __init__(self, align_corner=False, **kwargs): - super(AlignedAnchor3DRangeGenerator, self).__init__(**kwargs) - self.align_corner = align_corner - - def anchors_single_range(self, - feature_size, - anchor_range, - scale, - sizes=[[3.9, 1.6, 1.56]], - rotations=[0, 1.5707963], - device='cuda'): - """Generate anchors in a single range. - - Args: - feature_size (list[float] | tuple[float]): Feature map size. It is - either a list of a tuple of [D, H, W](in order of z, y, and x). - anchor_range (torch.Tensor | list[float]): Range of anchors with - shape [6]. The order is consistent with that of anchors, i.e., - (x_min, y_min, z_min, x_max, y_max, z_max). - scale (float | int): The scale factor of anchors. - sizes (list[list] | np.ndarray | torch.Tensor, optional): - Anchor size with shape [N, 3], in order of x, y, z. - Defaults to [[3.9, 1.6, 1.56]]. - rotations (list[float] | np.ndarray | torch.Tensor, optional): - Rotations of anchors in a single feature grid. - Defaults to [0, 1.5707963]. - device (str, optional): Devices that the anchors will be put on. - Defaults to 'cuda'. - - Returns: - torch.Tensor: Anchors with shape - [*feature_size, num_sizes, num_rots, 7]. 
- """ - if len(feature_size) == 2: - feature_size = [1, feature_size[0], feature_size[1]] - anchor_range = torch.tensor(anchor_range, device=device) - z_centers = torch.linspace( - anchor_range[2], - anchor_range[5], - feature_size[0] + 1, - device=device) - y_centers = torch.linspace( - anchor_range[1], - anchor_range[4], - feature_size[1] + 1, - device=device) - x_centers = torch.linspace( - anchor_range[0], - anchor_range[3], - feature_size[2] + 1, - device=device) - sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale - rotations = torch.tensor(rotations, device=device) - - # shift the anchor center - if not self.align_corner: - z_shift = (z_centers[1] - z_centers[0]) / 2 - y_shift = (y_centers[1] - y_centers[0]) / 2 - x_shift = (x_centers[1] - x_centers[0]) / 2 - z_centers += z_shift - y_centers += y_shift - x_centers += x_shift - - # torch.meshgrid default behavior is 'id', np's default is 'xy' - rets = torch.meshgrid(x_centers[:feature_size[2]], - y_centers[:feature_size[1]], - z_centers[:feature_size[0]], rotations) - - # torch.meshgrid returns a tuple rather than list - rets = list(rets) - tile_shape = [1] * 5 - tile_shape[-2] = int(sizes.shape[0]) - for i in range(len(rets)): - rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) - - sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) - tile_size_shape = list(rets[0].shape) - tile_size_shape[3] = 1 - sizes = sizes.repeat(tile_size_shape) - rets.insert(3, sizes) - - ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) - - if len(self.custom_values) > 0: - custom_ndim = len(self.custom_values) - custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) - # TODO: check the support of custom values - # custom[:] = self.custom_values - ret = torch.cat([ret, custom], dim=-1) - return ret - - -@ANCHOR_GENERATORS.register_module() -class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator): - """3D Anchor Generator by range for per class. - - This anchor generator generates anchors by the given range for per class. - Note that feature maps of different classes may be different. - - Args: - kwargs (dict): Arguments are the same as those in - :class:`AlignedAnchor3DRangeGenerator`. - """ - - def __init__(self, **kwargs): - super(AlignedAnchor3DRangeGeneratorPerCls, self).__init__(**kwargs) - assert len(self.scales) == 1, 'Multi-scale feature map levels are' + \ - ' not supported currently in this kind of anchor generator.' - - def grid_anchors(self, featmap_sizes, device='cuda'): - """Generate grid anchors in multiple feature levels. - - Args: - featmap_sizes (list[tuple]): List of feature map sizes for - different classes in a single feature level. - device (str, optional): Device where the anchors will be put on. - Defaults to 'cuda'. - - Returns: - list[list[torch.Tensor]]: Anchors in multiple feature levels. - Note that in this anchor generator, we currently only - support single feature level. The sizes of each tensor - should be [num_sizes/ranges*num_rots*featmap_size, - box_code_size]. - """ - multi_level_anchors = [] - anchors = self.multi_cls_grid_anchors( - featmap_sizes, self.scales[0], device=device) - multi_level_anchors.append(anchors) - return multi_level_anchors - - def multi_cls_grid_anchors(self, featmap_sizes, scale, device='cuda'): - """Generate grid anchors of a single level feature map for multi-class - with different feature map sizes. - - This function is usually called by method ``self.grid_anchors``. 
- - Args: - featmap_sizes (list[tuple]): List of feature map sizes for - different classes in a single feature level. - scale (float): Scale factor of the anchors in the current level. - device (str, optional): Device the tensor will be put on. - Defaults to 'cuda'. - - Returns: - torch.Tensor: Anchors in the overall feature map. - """ - assert len(featmap_sizes) == len(self.sizes) == len(self.ranges), \ - 'The number of different feature map sizes anchor sizes and ' + \ - 'ranges should be the same.' - - multi_cls_anchors = [] - for i in range(len(featmap_sizes)): - anchors = self.anchors_single_range( - featmap_sizes[i], - self.ranges[i], - scale, - self.sizes[i], - self.rotations, - device=device) - # [*featmap_size, num_sizes/ranges, num_rots, box_code_size] - ndim = len(featmap_sizes[i]) - anchors = anchors.view(*featmap_sizes[i], -1, anchors.size(-1)) - # [*featmap_size, num_sizes/ranges*num_rots, box_code_size] - anchors = anchors.permute(ndim, *range(0, ndim), ndim + 1) - # [num_sizes/ranges*num_rots, *featmap_size, box_code_size] - multi_cls_anchors.append(anchors.reshape(-1, anchors.size(-1))) - # [num_sizes/ranges*num_rots*featmap_size, box_code_size] - return multi_cls_anchors +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch + +from mmdet.core.anchor import ANCHOR_GENERATORS + + +@ANCHOR_GENERATORS.register_module() +class Anchor3DRangeGenerator(object): + """3D Anchor Generator by range. + + This anchor generator generates anchors by the given range in different + feature levels. + Due the convention in 3D detection, different anchor sizes are related to + different ranges for different categories. However we find this setting + does not effect the performance much in some datasets, e.g., nuScenes. + + Args: + ranges (list[list[float]]): Ranges of different anchors. + The ranges are the same across different feature levels. But may + vary for different anchor sizes if size_per_range is True. + sizes (list[list[float]], optional): 3D sizes of anchors. + Defaults to [[3.9, 1.6, 1.56]]. + scales (list[int], optional): Scales of anchors in different feature + levels. Defaults to [1]. + rotations (list[float], optional): Rotations of anchors in a feature + grid. Defaults to [0, 1.5707963]. + custom_values (tuple[float], optional): Customized values of that + anchor. For example, in nuScenes the anchors have velocities. + Defaults to (). + reshape_out (bool, optional): Whether to reshape the output into + (N x 4). Defaults to True. + size_per_range (bool, optional): Whether to use separate ranges for + different sizes. If size_per_range is True, the ranges should have + the same length as the sizes, if not, it will be duplicated. + Defaults to True. 
+ """ + + def __init__(self, + ranges, + sizes=[[3.9, 1.6, 1.56]], + scales=[1], + rotations=[0, 1.5707963], + custom_values=(), + reshape_out=True, + size_per_range=True): + assert mmcv.is_list_of(ranges, list) + if size_per_range: + if len(sizes) != len(ranges): + assert len(ranges) == 1 + ranges = ranges * len(sizes) + assert len(ranges) == len(sizes) + else: + assert len(ranges) == 1 + assert mmcv.is_list_of(sizes, list) + assert isinstance(scales, list) + + self.sizes = sizes + self.scales = scales + self.ranges = ranges + self.rotations = rotations + self.custom_values = custom_values + self.cached_anchors = None + self.reshape_out = reshape_out + self.size_per_range = size_per_range + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'anchor_range={self.ranges},\n' + s += f'scales={self.scales},\n' + s += f'sizes={self.sizes},\n' + s += f'rotations={self.rotations},\n' + s += f'reshape_out={self.reshape_out},\n' + s += f'size_per_range={self.size_per_range})' + return s + + @property + def num_base_anchors(self): + """list[int]: Total number of base anchors in a feature grid.""" + num_rot = len(self.rotations) + num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0) + return num_rot * num_size + + @property + def num_levels(self): + """int: Number of feature levels that the generator is applied to.""" + return len(self.scales) + + def grid_anchors(self, featmap_sizes, device='cuda'): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + device (str, optional): Device where the anchors will be put on. + Defaults to 'cuda'. + + Returns: + list[torch.Tensor]: Anchors in multiple feature levels. + The sizes of each tensor should be [N, 4], where + N = width * height * num_base_anchors, width and height + are the sizes of the corresponding feature level, + num_base_anchors is the number of anchors for that level. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_anchors( + featmap_sizes[i], self.scales[i], device=device) + if self.reshape_out: + anchors = anchors.reshape(-1, anchors.size(-1)) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_anchors(self, featmap_size, scale, device='cuda'): + """Generate grid anchors of a single level feature map. + + This function is usually called by method ``self.grid_anchors``. + + Args: + featmap_size (tuple[int]): Size of the feature map. + scale (float): Scale factor of the anchors in the current level. + device (str, optional): Device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature map. 
+ """ + # We reimplement the anchor generator using torch in cuda + # torch: 0.6975 s for 1000 times + # numpy: 4.3345 s for 1000 times + # which is ~5 times faster than the numpy implementation + if not self.size_per_range: + return self.anchors_single_range( + featmap_size, + self.ranges[0], + scale, + self.sizes, + self.rotations, + device=device) + + mr_anchors = [] + for anchor_range, anchor_size in zip(self.ranges, self.sizes): + mr_anchors.append( + self.anchors_single_range( + featmap_size, + anchor_range, + scale, + anchor_size, + self.rotations, + device=device)) + mr_anchors = torch.cat(mr_anchors, dim=-3) + return mr_anchors + + def anchors_single_range(self, + feature_size, + anchor_range, + scale=1, + sizes=[[3.9, 1.6, 1.56]], + rotations=[0, 1.5707963], + device='cuda'): + """Generate anchors in a single range. + + Args: + feature_size (list[float] | tuple[float]): Feature map size. It is + either a list of a tuple of [D, H, W](in order of z, y, and x). + anchor_range (torch.Tensor | list[float]): Range of anchors with + shape [6]. The order is consistent with that of anchors, i.e., + (x_min, y_min, z_min, x_max, y_max, z_max). + scale (float | int, optional): The scale factor of anchors. + Defaults to 1. + sizes (list[list] | np.ndarray | torch.Tensor, optional): + Anchor size with shape [N, 3], in order of x, y, z. + Defaults to [[3.9, 1.6, 1.56]]. + rotations (list[float] | np.ndarray | torch.Tensor, optional): + Rotations of anchors in a single feature grid. + Defaults to [0, 1.5707963]. + device (str): Devices that the anchors will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors with shape + [*feature_size, num_sizes, num_rots, 7]. + """ + if len(feature_size) == 2: + feature_size = [1, feature_size[0], feature_size[1]] + anchor_range = torch.tensor(anchor_range, device=device) + z_centers = torch.linspace( + anchor_range[2], anchor_range[5], feature_size[0], device=device) + y_centers = torch.linspace( + anchor_range[1], anchor_range[4], feature_size[1], device=device) + x_centers = torch.linspace( + anchor_range[0], anchor_range[3], feature_size[2], device=device) + sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale + rotations = torch.tensor(rotations, device=device) + + # torch.meshgrid default behavior is 'id', np's default is 'xy' + rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations) + # torch.meshgrid returns a tuple rather than list + rets = list(rets) + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) + + sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = sizes.repeat(tile_size_shape) + rets.insert(3, sizes) + + ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) + # [1, 200, 176, N, 2, 7] for kitti after permute + + if len(self.custom_values) > 0: + custom_ndim = len(self.custom_values) + custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) + # custom[:] = self.custom_values + ret = torch.cat([ret, custom], dim=-1) + # [1, 200, 176, N, 2, 9] for nus dataset after permute + return ret + + +@ANCHOR_GENERATORS.register_module() +class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator): + """Aligned 3D Anchor Generator by range. + + This anchor generator uses a different manner to generate the positions + of anchors' centers from :class:`Anchor3DRangeGenerator`. 
+ + Note: + The `align` means that the anchor's center is aligned with the voxel + grid, which is also the feature grid. The previous implementation of + :class:`Anchor3DRangeGenerator` does not generate the anchors' center + according to the voxel grid. Rather, it generates the center by + uniformly distributing the anchors inside the minimum and maximum + anchor ranges according to the feature map sizes. + However, this makes the anchors center does not match the feature grid. + The :class:`AlignedAnchor3DRangeGenerator` add + 1 when using the + feature map sizes to obtain the corners of the voxel grid. Then it + shifts the coordinates to the center of voxel grid and use the left + up corner to distribute anchors. + + Args: + anchor_corner (bool, optional): Whether to align with the corner of the + voxel grid. By default it is False and the anchor's center will be + the same as the corresponding voxel's center, which is also the + center of the corresponding greature grid. Defaults to False. + """ + + def __init__(self, align_corner=False, **kwargs): + super(AlignedAnchor3DRangeGenerator, self).__init__(**kwargs) + self.align_corner = align_corner + + def anchors_single_range(self, + feature_size, + anchor_range, + scale, + sizes=[[3.9, 1.6, 1.56]], + rotations=[0, 1.5707963], + device='cuda'): + """Generate anchors in a single range. + + Args: + feature_size (list[float] | tuple[float]): Feature map size. It is + either a list of a tuple of [D, H, W](in order of z, y, and x). + anchor_range (torch.Tensor | list[float]): Range of anchors with + shape [6]. The order is consistent with that of anchors, i.e., + (x_min, y_min, z_min, x_max, y_max, z_max). + scale (float | int): The scale factor of anchors. + sizes (list[list] | np.ndarray | torch.Tensor, optional): + Anchor size with shape [N, 3], in order of x, y, z. + Defaults to [[3.9, 1.6, 1.56]]. + rotations (list[float] | np.ndarray | torch.Tensor, optional): + Rotations of anchors in a single feature grid. + Defaults to [0, 1.5707963]. + device (str, optional): Devices that the anchors will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors with shape + [*feature_size, num_sizes, num_rots, 7]. 
+ """ + if len(feature_size) == 2: + feature_size = [1, feature_size[0], feature_size[1]] + anchor_range = torch.tensor(anchor_range, device=device) + z_centers = torch.linspace( + anchor_range[2], + anchor_range[5], + feature_size[0] + 1, + device=device) + y_centers = torch.linspace( + anchor_range[1], + anchor_range[4], + feature_size[1] + 1, + device=device) + x_centers = torch.linspace( + anchor_range[0], + anchor_range[3], + feature_size[2] + 1, + device=device) + sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale + rotations = torch.tensor(rotations, device=device) + + # shift the anchor center + if not self.align_corner: + z_shift = (z_centers[1] - z_centers[0]) / 2 + y_shift = (y_centers[1] - y_centers[0]) / 2 + x_shift = (x_centers[1] - x_centers[0]) / 2 + z_centers += z_shift + y_centers += y_shift + x_centers += x_shift + + # torch.meshgrid default behavior is 'id', np's default is 'xy' + rets = torch.meshgrid(x_centers[:feature_size[2]], + y_centers[:feature_size[1]], + z_centers[:feature_size[0]], rotations) + + # torch.meshgrid returns a tuple rather than list + rets = list(rets) + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) + + sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = sizes.repeat(tile_size_shape) + rets.insert(3, sizes) + + ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) + + if len(self.custom_values) > 0: + custom_ndim = len(self.custom_values) + custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) + # TODO: check the support of custom values + # custom[:] = self.custom_values + ret = torch.cat([ret, custom], dim=-1) + return ret + + +@ANCHOR_GENERATORS.register_module() +class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator): + """3D Anchor Generator by range for per class. + + This anchor generator generates anchors by the given range for per class. + Note that feature maps of different classes may be different. + + Args: + kwargs (dict): Arguments are the same as those in + :class:`AlignedAnchor3DRangeGenerator`. + """ + + def __init__(self, **kwargs): + super(AlignedAnchor3DRangeGeneratorPerCls, self).__init__(**kwargs) + assert len(self.scales) == 1, 'Multi-scale feature map levels are' + \ + ' not supported currently in this kind of anchor generator.' + + def grid_anchors(self, featmap_sizes, device='cuda'): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes for + different classes in a single feature level. + device (str, optional): Device where the anchors will be put on. + Defaults to 'cuda'. + + Returns: + list[list[torch.Tensor]]: Anchors in multiple feature levels. + Note that in this anchor generator, we currently only + support single feature level. The sizes of each tensor + should be [num_sizes/ranges*num_rots*featmap_size, + box_code_size]. + """ + multi_level_anchors = [] + anchors = self.multi_cls_grid_anchors( + featmap_sizes, self.scales[0], device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def multi_cls_grid_anchors(self, featmap_sizes, scale, device='cuda'): + """Generate grid anchors of a single level feature map for multi-class + with different feature map sizes. + + This function is usually called by method ``self.grid_anchors``. 
+ + Args: + featmap_sizes (list[tuple]): List of feature map sizes for + different classes in a single feature level. + scale (float): Scale factor of the anchors in the current level. + device (str, optional): Device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature map. + """ + assert len(featmap_sizes) == len(self.sizes) == len(self.ranges), \ + 'The number of different feature map sizes anchor sizes and ' + \ + 'ranges should be the same.' + + multi_cls_anchors = [] + for i in range(len(featmap_sizes)): + anchors = self.anchors_single_range( + featmap_sizes[i], + self.ranges[i], + scale, + self.sizes[i], + self.rotations, + device=device) + # [*featmap_size, num_sizes/ranges, num_rots, box_code_size] + ndim = len(featmap_sizes[i]) + anchors = anchors.view(*featmap_sizes[i], -1, anchors.size(-1)) + # [*featmap_size, num_sizes/ranges*num_rots, box_code_size] + anchors = anchors.permute(ndim, *range(0, ndim), ndim + 1) + # [num_sizes/ranges*num_rots, *featmap_size, box_code_size] + multi_cls_anchors.append(anchors.reshape(-1, anchors.size(-1))) + # [num_sizes/ranges*num_rots*featmap_size, box_code_size] + return multi_cls_anchors diff --git a/mmdet3d/core/bbox/__init__.py b/mmdet3d/core/bbox/__init__.py index 8c66630..4e26968 100644 --- a/mmdet3d/core/bbox/__init__.py +++ b/mmdet3d/core/bbox/__init__.py @@ -1,30 +1,30 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .assigners import AssignResult, BaseAssigner, MaxIoUAssigner -from .coders import DeltaXYZWLHRBBoxCoder -# from .bbox_target import bbox_target -from .iou_calculators import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, - BboxOverlapsNearest3D, - axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, - bbox_overlaps_nearest_3d) -from .samplers import (BaseSampler, CombinedSampler, - InstanceBalancedPosSampler, IoUBalancedNegSampler, - PseudoSampler, RandomSampler, SamplingResult) -from .structures import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes, - Coord3DMode, DepthInstance3DBoxes, - LiDARInstance3DBoxes, get_box_type, limit_period, - mono_cam_box2vis, points_cam2img, points_img2cam, - xywhr2xyxyr) -from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back - -__all__ = [ - 'BaseSampler', 'AssignResult', 'BaseAssigner', 'MaxIoUAssigner', - 'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler', - 'IoUBalancedNegSampler', 'CombinedSampler', 'SamplingResult', - 'DeltaXYZWLHRBBoxCoder', 'BboxOverlapsNearest3D', 'BboxOverlaps3D', - 'bbox_overlaps_nearest_3d', 'bbox_overlaps_3d', - 'AxisAlignedBboxOverlaps3D', 'axis_aligned_bbox_overlaps_3d', 'Box3DMode', - 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'bbox3d2roi', - 'bbox3d2result', 'DepthInstance3DBoxes', 'BaseInstance3DBoxes', - 'bbox3d_mapping_back', 'xywhr2xyxyr', 'limit_period', 'points_cam2img', - 'points_img2cam', 'get_box_type', 'Coord3DMode', 'mono_cam_box2vis' -] +# Copyright (c) OpenMMLab. All rights reserved. 
+from .assigners import AssignResult, BaseAssigner, MaxIoUAssigner +from .coders import DeltaXYZWLHRBBoxCoder +# from .bbox_target import bbox_target +from .iou_calculators import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, + BboxOverlapsNearest3D, + axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, + bbox_overlaps_nearest_3d) +from .samplers import (BaseSampler, CombinedSampler, + InstanceBalancedPosSampler, IoUBalancedNegSampler, + PseudoSampler, RandomSampler, SamplingResult) +from .structures import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes, + Coord3DMode, DepthInstance3DBoxes, + LiDARInstance3DBoxes, get_box_type, limit_period, + mono_cam_box2vis, points_cam2img, points_img2cam, + xywhr2xyxyr) +from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back + +__all__ = [ + 'BaseSampler', 'AssignResult', 'BaseAssigner', 'MaxIoUAssigner', + 'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler', + 'IoUBalancedNegSampler', 'CombinedSampler', 'SamplingResult', + 'DeltaXYZWLHRBBoxCoder', 'BboxOverlapsNearest3D', 'BboxOverlaps3D', + 'bbox_overlaps_nearest_3d', 'bbox_overlaps_3d', + 'AxisAlignedBboxOverlaps3D', 'axis_aligned_bbox_overlaps_3d', 'Box3DMode', + 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'bbox3d2roi', + 'bbox3d2result', 'DepthInstance3DBoxes', 'BaseInstance3DBoxes', + 'bbox3d_mapping_back', 'xywhr2xyxyr', 'limit_period', 'points_cam2img', + 'points_img2cam', 'get_box_type', 'Coord3DMode', 'mono_cam_box2vis' +] diff --git a/mmdet3d/core/bbox/assigners/__init__.py b/mmdet3d/core/bbox/assigners/__init__.py index d149368..7fcd31f 100644 --- a/mmdet3d/core/bbox/assigners/__init__.py +++ b/mmdet3d/core/bbox/assigners/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.core.bbox import AssignResult, BaseAssigner, MaxIoUAssigner - -__all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult'] +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.core.bbox import AssignResult, BaseAssigner, MaxIoUAssigner + +__all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult'] diff --git a/mmdet3d/core/bbox/box_np_ops.py b/mmdet3d/core/bbox/box_np_ops.py index bb52bbb..c33ce51 100644 --- a/mmdet3d/core/bbox/box_np_ops.py +++ b/mmdet3d/core/bbox/box_np_ops.py @@ -1,827 +1,827 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# TODO: clean the functions in this file and move the APIs into box structures -# in the future -# NOTICE: All functions in this file are valid for LiDAR or depth boxes only -# if we use default parameters. - -import numba -import numpy as np - -from .structures.utils import limit_period, points_cam2img, rotation_3d_in_axis - - -def camera_to_lidar(points, r_rect, velo2cam): - """Convert points in camera coordinate to lidar coordinate. - - Note: - This function is for KITTI only. - - Args: - points (np.ndarray, shape=[N, 3]): Points in camera coordinate. - r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in - specific camera coordinate (e.g. CAM2) to CAM0. - velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in - camera coordinate to lidar coordinate. - - Returns: - np.ndarray, shape=[N, 3]: Points in lidar coordinate. - """ - points_shape = list(points.shape[0:-1]) - if points.shape[-1] == 3: - points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) - lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) - return lidar_points[..., :3] - - -def box_camera_to_lidar(data, r_rect, velo2cam): - """Convert boxes in camera coordinate to lidar coordinate. 
- - Note: - This function is for KITTI only. - - Args: - data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. - r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in - specific camera coordinate (e.g. CAM2) to CAM0. - velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in - camera coordinate to lidar coordinate. - - Returns: - np.ndarray, shape=[N, 3]: Boxes in lidar coordinate. - """ - xyz = data[:, 0:3] - x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6] - r = data[:, 6:7] - xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) - # yaw and dims also needs to be converted - r_new = -r - np.pi / 2 - r_new = limit_period(r_new, period=np.pi * 2) - return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1) - - -def corners_nd(dims, origin=0.5): - """Generate relative box corners based on length per dim and origin point. - - Args: - dims (np.ndarray, shape=[N, ndim]): Array of length per dim - origin (list or array or float, optional): origin point relate to - smallest point. Defaults to 0.5 - - Returns: - np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners. - point layout example: (2d) x0y0, x0y1, x1y0, x1y1; - (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 - where x0 < x1, y0 < y1, z0 < z1. - """ - ndim = int(dims.shape[1]) - corners_norm = np.stack( - np.unravel_index(np.arange(2**ndim), [2] * ndim), - axis=1).astype(dims.dtype) - # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 - # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 - # so need to convert to a format which is convenient to do other computing. - # for 2d boxes, format is clockwise start with minimum point - # for 3d boxes, please draw lines by your hand. - if ndim == 2: - # generate clockwise box corners - corners_norm = corners_norm[[0, 1, 3, 2]] - elif ndim == 3: - corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] - corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) - corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( - [1, 2**ndim, ndim]) - return corners - - -def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): - """Convert kitti locations, dimensions and angles to corners. - format: center(xy), dims(xy), angles(counterclockwise when positive) - - Args: - centers (np.ndarray): Locations in kitti label file with shape (N, 2). - dims (np.ndarray): Dimensions in kitti label file with shape (N, 2). - angles (np.ndarray, optional): Rotation_y in kitti label file with - shape (N). Defaults to None. - origin (list or array or float, optional): origin point relate to - smallest point. Defaults to 0.5. - - Returns: - np.ndarray: Corners with the shape of (N, 4, 2). - """ - # 'length' in kitti format is in x axis. - # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) - # center in kitti format is [0.5, 1.0, 0.5] in xyz. - corners = corners_nd(dims, origin=origin) - # corners: [N, 4, 2] - if angles is not None: - corners = rotation_3d_in_axis(corners, angles) - corners += centers.reshape([-1, 1, 2]) - return corners - - -@numba.jit(nopython=True) -def depth_to_points(depth, trunc_pixel): - """Convert depth map to points. - - Args: - depth (np.array, shape=[H, W]): Depth map which - the row of [0~`trunc_pixel`] are truncated. - trunc_pixel (int): The number of truncated row. - - Returns: - np.ndarray: Points in camera coordinates. 
- """ - num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) - points = np.zeros((num_pts, 3), dtype=depth.dtype) - x = np.array([0, 0, 1], dtype=depth.dtype) - k = 0 - for i in range(trunc_pixel, depth.shape[0]): - for j in range(depth.shape[1]): - if depth[i, j] > 0.1: - x = np.array([j, i, 1], dtype=depth.dtype) - points[k] = x * depth[i, j] - k += 1 - return points - - -def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): - """Convert depth map to points in lidar coordinate. - - Args: - depth (np.array, shape=[H, W]): Depth map which - the row of [0~`trunc_pixel`] are truncated. - trunc_pixel (int): The number of truncated row. - P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. - r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in - specific camera coordinate (e.g. CAM2) to CAM0. - velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in - camera coordinate to lidar coordinate. - - Returns: - np.ndarray: Points in lidar coordinates. - """ - pts = depth_to_points(depth, trunc_pixel) - points_shape = list(pts.shape[0:-1]) - points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) - points = points @ np.linalg.inv(P2.T) - lidar_points = camera_to_lidar(points, r_rect, velo2cam) - return lidar_points - - -def center_to_corner_box3d(centers, - dims, - angles=None, - origin=(0.5, 1.0, 0.5), - axis=1): - """Convert kitti locations, dimensions and angles to corners. - - Args: - centers (np.ndarray): Locations in kitti label file with shape (N, 3). - dims (np.ndarray): Dimensions in kitti label file with shape (N, 3). - angles (np.ndarray, optional): Rotation_y in kitti label file with - shape (N). Defaults to None. - origin (list or array or float, optional): Origin point relate to - smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) - in lidar. Defaults to (0.5, 1.0, 0.5). - axis (int, optional): Rotation axis. 1 for camera and 2 for lidar. - Defaults to 1. - - Returns: - np.ndarray: Corners with the shape of (N, 8, 3). - """ - # 'length' in kitti format is in x axis. - # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar) - # center in kitti format is [0.5, 1.0, 0.5] in xyz. - corners = corners_nd(dims, origin=origin) - # corners: [N, 8, 3] - if angles is not None: - corners = rotation_3d_in_axis(corners, angles, axis=axis) - corners += centers.reshape([-1, 1, 3]) - return corners - - -@numba.jit(nopython=True) -def box2d_to_corner_jit(boxes): - """Convert box2d to corner. - - Args: - boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation. - - Returns: - box_corners (np.ndarray, shape=[N, 4, 2]): Box corners. - """ - num_box = boxes.shape[0] - corners_norm = np.zeros((4, 2), dtype=boxes.dtype) - corners_norm[1, 1] = 1.0 - corners_norm[2] = 1.0 - corners_norm[3, 0] = 1.0 - corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) - corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( - 1, 4, 2) - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) - for i in range(num_box): - rot_sin = np.sin(boxes[i, -1]) - rot_cos = np.cos(boxes[i, -1]) - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = rot_sin - rot_mat_T[1, 0] = -rot_sin - rot_mat_T[1, 1] = rot_cos - box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] - return box_corners - - -@numba.njit -def corner_to_standup_nd_jit(boxes_corner): - """Convert boxes_corner to aligned (min-max) boxes. - - Args: - boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners. 
- - Returns: - np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes. - """ - num_boxes = boxes_corner.shape[0] - ndim = boxes_corner.shape[-1] - result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) - for i in range(num_boxes): - for j in range(ndim): - result[i, j] = np.min(boxes_corner[i, :, j]) - for j in range(ndim): - result[i, j + ndim] = np.max(boxes_corner[i, :, j]) - return result - - -@numba.jit(nopython=True) -def corner_to_surfaces_3d_jit(corners): - """Convert 3d box corners from corner function above to surfaces that - normal vectors all direct to internal. - - Args: - corners (np.ndarray): 3d box corners with the shape of (N, 8, 3). - - Returns: - np.ndarray: Surfaces with the shape of (N, 6, 4, 3). - """ - # box_corners: [N, 8, 3], must from corner functions in this module - num_boxes = corners.shape[0] - surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) - corner_idxes = np.array([ - 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 - ]).reshape(6, 4) - for i in range(num_boxes): - for j in range(6): - for k in range(4): - surfaces[i, j, k] = corners[i, corner_idxes[j, k]] - return surfaces - - -def rotation_points_single_angle(points, angle, axis=0): - """Rotate points with a single angle. - - Args: - points (np.ndarray, shape=[N, 3]]): - angle (np.ndarray, shape=[1]]): - axis (int, optional): Axis to rotate at. Defaults to 0. - - Returns: - np.ndarray: Rotated points. - """ - # points: [N, 3] - rot_sin = np.sin(angle) - rot_cos = np.cos(angle) - if axis == 1: - rot_mat_T = np.array( - [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]], - dtype=points.dtype) - elif axis == 2 or axis == -1: - rot_mat_T = np.array( - [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]], - dtype=points.dtype) - elif axis == 0: - rot_mat_T = np.array( - [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]], - dtype=points.dtype) - else: - raise ValueError('axis should in range') - - return points @ rot_mat_T, rot_mat_T - - -def box3d_to_bbox(box3d, P2): - """Convert box3d in camera coordinates to bbox in image coordinates. - - Args: - box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. - P2 (np.array, shape=[4, 4]): Intrinsics of Camera2. - - Returns: - np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates. - """ - box_corners = center_to_corner_box3d( - box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1) - box_corners_in_image = points_cam2img(box_corners, P2) - # box_corners_in_image: [N, 8, 2] - minxy = np.min(box_corners_in_image, axis=1) - maxxy = np.max(box_corners_in_image, axis=1) - bbox = np.concatenate([minxy, maxxy], axis=1) - return bbox - - -def corner_to_surfaces_3d(corners): - """convert 3d box corners from corner function above to surfaces that - normal vectors all direct to internal. - - Args: - corners (np.ndarray): 3D box corners with shape of (N, 8, 3). - - Returns: - np.ndarray: Surfaces with the shape of (N, 6, 4, 3). 
- """ - # box_corners: [N, 8, 3], must from corner functions in this module - surfaces = np.array([ - [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], - [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], - [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], - [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], - [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], - [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], - ]).transpose([2, 0, 1, 3]) - return surfaces - - -def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): - """Check points in rotated bbox and return indices. - - Note: - This function is for counterclockwise boxes. - - Args: - points (np.ndarray, shape=[N, 3+dim]): Points to query. - rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation. - z_axis (int, optional): Indicate which axis is height. - Defaults to 2. - origin (tuple[int], optional): Indicate the position of - box center. Defaults to (0.5, 0.5, 0). - - Returns: - np.ndarray, shape=[N, M]: Indices of points in each box. - """ - # TODO: this function is different from PointCloud3D, be careful - # when start to use nuscene, check the input - rbbox_corners = center_to_corner_box3d( - rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis) - surfaces = corner_to_surfaces_3d(rbbox_corners) - indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) - return indices - - -def minmax_to_corner_2d(minmax_box): - """Convert minmax box to corners2d. - - Args: - minmax_box (np.ndarray, shape=[N, dims]): minmax boxes. - - Returns: - np.ndarray: 2d corners of boxes - """ - ndim = minmax_box.shape[-1] // 2 - center = minmax_box[..., :ndim] - dims = minmax_box[..., ndim:] - center - return center_to_corner_box2d(center, dims, origin=0.0) - - -def create_anchors_3d_range(feature_size, - anchor_range, - sizes=((3.9, 1.6, 1.56), ), - rotations=(0, np.pi / 2), - dtype=np.float32): - """Create anchors 3d by range. - - Args: - feature_size (list[float] | tuple[float]): Feature map size. It is - either a list of a tuple of [D, H, W](in order of z, y, and x). - anchor_range (torch.Tensor | list[float]): Range of anchors with - shape [6]. The order is consistent with that of anchors, i.e., - (x_min, y_min, z_min, x_max, y_max, z_max). - sizes (list[list] | np.ndarray | torch.Tensor, optional): - Anchor size with shape [N, 3], in order of x, y, z. - Defaults to ((3.9, 1.6, 1.56), ). - rotations (list[float] | np.ndarray | torch.Tensor, optional): - Rotations of anchors in a single feature grid. - Defaults to (0, np.pi / 2). - dtype (type, optional): Data type. Defaults to np.float32. - - Returns: - np.ndarray: Range based anchors with shape of - (*feature_size, num_sizes, num_rots, 7). 
- """ - anchor_range = np.array(anchor_range, dtype) - z_centers = np.linspace( - anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype) - y_centers = np.linspace( - anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype) - x_centers = np.linspace( - anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype) - sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) - rotations = np.array(rotations, dtype=dtype) - rets = np.meshgrid( - x_centers, y_centers, z_centers, rotations, indexing='ij') - tile_shape = [1] * 5 - tile_shape[-2] = int(sizes.shape[0]) - for i in range(len(rets)): - rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) - rets[i] = rets[i][..., np.newaxis] # for concat - sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) - tile_size_shape = list(rets[0].shape) - tile_size_shape[3] = 1 - sizes = np.tile(sizes, tile_size_shape) - rets.insert(3, sizes) - ret = np.concatenate(rets, axis=-1) - return np.transpose(ret, [2, 1, 0, 3, 4, 5]) - - -def center_to_minmax_2d(centers, dims, origin=0.5): - """Center to minmax. - - Args: - centers (np.ndarray): Center points. - dims (np.ndarray): Dimensions. - origin (list or array or float, optional): Origin point relate - to smallest point. Defaults to 0.5. - - Returns: - np.ndarray: Minmax points. - """ - if origin == 0.5: - return np.concatenate([centers - dims / 2, centers + dims / 2], - axis=-1) - corners = center_to_corner_box2d(centers, dims, origin=origin) - return corners[:, [0, 2]].reshape([-1, 4]) - - -def rbbox2d_to_near_bbox(rbboxes): - """convert rotated bbox to nearest 'standing' or 'lying' bbox. - - Args: - rbboxes (np.ndarray): Rotated bboxes with shape of - (N, 5(x, y, xdim, ydim, rad)). - - Returns: - np.ndarray: Bounding boxes with the shape of - (N, 4(xmin, ymin, xmax, ymax)). - """ - rots = rbboxes[..., -1] - rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) - cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] - bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) - bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) - return bboxes - - -@numba.jit(nopython=True) -def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): - """Calculate box iou. Note that jit version runs ~10x faster than the - box_overlaps function in mmdet3d.core.evaluation. - - Note: - This function is for counterclockwise boxes. - - Args: - boxes (np.ndarray): Input bounding boxes with shape of (N, 4). - query_boxes (np.ndarray): Query boxes with shape of (K, 4). - mode (str, optional): IoU mode. Defaults to 'iou'. - eps (float, optional): Value added to denominator. Defaults to 0. - - Returns: - np.ndarray: Overlap between boxes and query_boxes - with the shape of [N, K]. - """ - N = boxes.shape[0] - K = query_boxes.shape[0] - overlaps = np.zeros((N, K), dtype=boxes.dtype) - for k in range(K): - box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * - (query_boxes[k, 3] - query_boxes[k, 1] + eps)) - for n in range(N): - iw = ( - min(boxes[n, 2], query_boxes[k, 2]) - - max(boxes[n, 0], query_boxes[k, 0]) + eps) - if iw > 0: - ih = ( - min(boxes[n, 3], query_boxes[k, 3]) - - max(boxes[n, 1], query_boxes[k, 1]) + eps) - if ih > 0: - if mode == 'iou': - ua = ((boxes[n, 2] - boxes[n, 0] + eps) * - (boxes[n, 3] - boxes[n, 1] + eps) + box_area - - iw * ih) - else: - ua = ((boxes[n, 2] - boxes[n, 0] + eps) * - (boxes[n, 3] - boxes[n, 1] + eps)) - overlaps[n, k] = iw * ih / ua - return overlaps - - -def projection_matrix_to_CRT_kitti(proj): - """Split projection matrix of KITTI. 
- - Note: - This function is for KITTI only. - - P = C @ [R|T] - C is upper triangular matrix, so we need to inverse CR and use QR - stable for all kitti camera projection matrix. - - Args: - proj (p.array, shape=[4, 4]): Intrinsics of camera. - - Returns: - tuple[np.ndarray]: Splited matrix of C, R and T. - """ - - CR = proj[0:3, 0:3] - CT = proj[0:3, 3] - RinvCinv = np.linalg.inv(CR) - Rinv, Cinv = np.linalg.qr(RinvCinv) - C = np.linalg.inv(Cinv) - R = np.linalg.inv(Rinv) - T = Cinv @ CT - return C, R, T - - -def remove_outside_points(points, rect, Trv2c, P2, image_shape): - """Remove points which are outside of image. - - Note: - This function is for KITTI only. - - Args: - points (np.ndarray, shape=[N, 3+dims]): Total points. - rect (np.ndarray, shape=[4, 4]): Matrix to project points in - specific camera coordinate (e.g. CAM2) to CAM0. - Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in - camera coordinate to lidar coordinate. - P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. - image_shape (list[int]): Shape of image. - - Returns: - np.ndarray, shape=[N, 3+dims]: Filtered points. - """ - # 5x faster than remove_outside_points_v1(2ms vs 10ms) - C, R, T = projection_matrix_to_CRT_kitti(P2) - image_bbox = [0, 0, image_shape[1], image_shape[0]] - frustum = get_frustum(image_bbox, C) - frustum -= T - frustum = np.linalg.inv(R) @ frustum.T - frustum = camera_to_lidar(frustum.T, rect, Trv2c) - frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) - indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) - points = points[indices.reshape([-1])] - return points - - -def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): - """Get frustum corners in camera coordinates. - - Args: - bbox_image (list[int]): box in image coordinates. - C (np.ndarray): Intrinsics. - near_clip (float, optional): Nearest distance of frustum. - Defaults to 0.001. - far_clip (float, optional): Farthest distance of frustum. - Defaults to 100. - - Returns: - np.ndarray, shape=[8, 3]: coordinates of frustum corners. - """ - fku = C[0, 0] - fkv = -C[1, 1] - u0v0 = C[0:2, 2] - z_points = np.array( - [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] - b = bbox_image - box_corners = np.array( - [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], - dtype=C.dtype) - near_box_corners = (box_corners - u0v0) / np.array( - [fku / near_clip, -fkv / near_clip], dtype=C.dtype) - far_box_corners = (box_corners - u0v0) / np.array( - [fku / far_clip, -fkv / far_clip], dtype=C.dtype) - ret_xy = np.concatenate([near_box_corners, far_box_corners], - axis=0) # [8, 2] - ret_xyz = np.concatenate([ret_xy, z_points], axis=1) - return ret_xyz - - -def surface_equ_3d(polygon_surfaces): - """ - - Args: - polygon_surfaces (np.ndarray): Polygon surfaces with shape of - [num_polygon, max_num_surfaces, max_num_points_of_surface, 3]. - All surfaces' normal vector must direct to internal. - Max_num_points_of_surface must at least 3. - - Returns: - tuple: normal vector and its direction. 
- """ - # return [a, b, c], d in ax+by+cz+d=0 - # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] - surface_vec = polygon_surfaces[:, :, :2, :] - \ - polygon_surfaces[:, :, 1:3, :] - # normal_vec: [..., 3] - normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) - # print(normal_vec.shape, points[..., 0, :].shape) - # d = -np.inner(normal_vec, points[..., 0, :]) - d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) - return normal_vec, -d - - -@numba.njit -def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, - num_surfaces): - """ - Args: - points (np.ndarray): Input points with shape of (num_points, 3). - polygon_surfaces (np.ndarray): Polygon surfaces with shape of - (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). - All surfaces' normal vector must direct to internal. - Max_num_points_of_surface must at least 3. - normal_vec (np.ndarray): Normal vector of polygon_surfaces. - d (int): Directions of normal vector. - num_surfaces (np.ndarray): Number of surfaces a polygon contains - shape of (num_polygon). - - Returns: - np.ndarray: Result matrix with the shape of [num_points, num_polygon]. - """ - max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] - num_points = points.shape[0] - num_polygons = polygon_surfaces.shape[0] - ret = np.ones((num_points, num_polygons), dtype=np.bool_) - sign = 0.0 - for i in range(num_points): - for j in range(num_polygons): - for k in range(max_num_surfaces): - if k > num_surfaces[j]: - break - sign = ( - points[i, 0] * normal_vec[j, k, 0] + - points[i, 1] * normal_vec[j, k, 1] + - points[i, 2] * normal_vec[j, k, 2] + d[j, k]) - if sign >= 0: - ret[i, j] = False - break - return ret - - -def points_in_convex_polygon_3d_jit(points, - polygon_surfaces, - num_surfaces=None): - """Check points is in 3d convex polygons. - - Args: - points (np.ndarray): Input points with shape of (num_points, 3). - polygon_surfaces (np.ndarray): Polygon surfaces with shape of - (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). - All surfaces' normal vector must direct to internal. - Max_num_points_of_surface must at least 3. - num_surfaces (np.ndarray, optional): Number of surfaces a polygon - contains shape of (num_polygon). Defaults to None. - - Returns: - np.ndarray: Result matrix with the shape of [num_points, num_polygon]. - """ - max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] - # num_points = points.shape[0] - num_polygons = polygon_surfaces.shape[0] - if num_surfaces is None: - num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) - normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) - # normal_vec: [num_polygon, max_num_surfaces, 3] - # d: [num_polygon, max_num_surfaces] - return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, - normal_vec, d, num_surfaces) - - -@numba.njit -def points_in_convex_polygon_jit(points, polygon, clockwise=False): - """Check points is in 2d convex polygons. True when point in polygon. - - Args: - points (np.ndarray): Input points with the shape of [num_points, 2]. - polygon (np.ndarray): Input polygon with the shape of - [num_polygon, num_points_of_polygon, 2]. - clockwise (bool, optional): Indicate polygon is clockwise. Defaults - to True. - - Returns: - np.ndarray: Result matrix with the shape of [num_points, num_polygon]. 
- """ - # first convert polygon to directed lines - num_points_of_polygon = polygon.shape[1] - num_points = points.shape[0] - num_polygons = polygon.shape[0] - # vec for all the polygons - if clockwise: - vec1 = polygon - polygon[:, - np.array([num_points_of_polygon - 1] + list( - range(num_points_of_polygon - 1))), :] - else: - vec1 = polygon[:, - np.array([num_points_of_polygon - 1] + - list(range(num_points_of_polygon - - 1))), :] - polygon - ret = np.zeros((num_points, num_polygons), dtype=np.bool_) - success = True - cross = 0.0 - for i in range(num_points): - for j in range(num_polygons): - success = True - for k in range(num_points_of_polygon): - vec = vec1[j, k] - cross = vec[1] * (polygon[j, k, 0] - points[i, 0]) - cross -= vec[0] * (polygon[j, k, 1] - points[i, 1]) - if cross >= 0: - success = False - break - ret[i, j] = success - return ret - - -def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): - """Convert kitti center boxes to corners. - - 7 -------- 4 - /| /| - 6 -------- 5 . - | | | | - . 3 -------- 0 - |/ |/ - 2 -------- 1 - - Note: - This function is for LiDAR boxes only. - - Args: - boxes3d (np.ndarray): Boxes with shape of (N, 7) - [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords, - see the definition of ry in KITTI dataset. - bottom_center (bool, optional): Whether z is on the bottom center - of object. Defaults to True. - - Returns: - np.ndarray: Box corners with the shape of [N, 8, 3]. - """ - boxes_num = boxes3d.shape[0] - x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] - x_corners = np.array([ - x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2., - -x_size / 2., -x_size / 2., x_size / 2. - ], - dtype=np.float32).T - y_corners = np.array([ - -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2., - -y_size / 2., y_size / 2., y_size / 2. - ], - dtype=np.float32).T - if bottom_center: - z_corners = np.zeros((boxes_num, 8), dtype=np.float32) - z_corners[:, 4:8] = z_size.reshape(boxes_num, 1).repeat( - 4, axis=1) # (N, 8) - else: - z_corners = np.array([ - -z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2., - z_size / 2., z_size / 2., z_size / 2., z_size / 2. - ], - dtype=np.float32).T - - ry = boxes3d[:, 6] - zeros, ones = np.zeros( - ry.size, dtype=np.float32), np.ones( - ry.size, dtype=np.float32) - rot_list = np.array([[np.cos(ry), np.sin(ry), zeros], - [-np.sin(ry), np.cos(ry), zeros], - [zeros, zeros, ones]]) # (3, 3, N) - R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3) - - temp_corners = np.concatenate((x_corners.reshape( - -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)), - axis=2) # (N, 8, 3) - rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3) - x_corners = rotated_corners[:, :, 0] - y_corners = rotated_corners[:, :, 1] - z_corners = rotated_corners[:, :, 2] - - x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2] - - x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8) - y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8) - z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8) - - corners = np.concatenate( - (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)), - axis=2) - - return corners.astype(np.float32) +# Copyright (c) OpenMMLab. All rights reserved. +# TODO: clean the functions in this file and move the APIs into box structures +# in the future +# NOTICE: All functions in this file are valid for LiDAR or depth boxes only +# if we use default parameters. 
+ +import numba +import numpy as np + +from .structures.utils import limit_period, points_cam2img, rotation_3d_in_axis + + +def camera_to_lidar(points, r_rect, velo2cam): + """Convert points in camera coordinate to lidar coordinate. + + Note: + This function is for KITTI only. + + Args: + points (np.ndarray, shape=[N, 3]): Points in camera coordinate. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray, shape=[N, 3]: Points in lidar coordinate. + """ + points_shape = list(points.shape[0:-1]) + if points.shape[-1] == 3: + points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) + lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) + return lidar_points[..., :3] + + +def box_camera_to_lidar(data, r_rect, velo2cam): + """Convert boxes in camera coordinate to lidar coordinate. + + Note: + This function is for KITTI only. + + Args: + data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray, shape=[N, 3]: Boxes in lidar coordinate. + """ + xyz = data[:, 0:3] + x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6] + r = data[:, 6:7] + xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) + # yaw and dims also needs to be converted + r_new = -r - np.pi / 2 + r_new = limit_period(r_new, period=np.pi * 2) + return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1) + + +def corners_nd(dims, origin=0.5): + """Generate relative box corners based on length per dim and origin point. + + Args: + dims (np.ndarray, shape=[N, ndim]): Array of length per dim + origin (list or array or float, optional): origin point relate to + smallest point. Defaults to 0.5 + + Returns: + np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners. + point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1. + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack( + np.unravel_index(np.arange(2**ndim), [2] * ndim), + axis=1).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. + if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( + [1, 2**ndim, ndim]) + return corners + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """Convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(counterclockwise when positive) + + Args: + centers (np.ndarray): Locations in kitti label file with shape (N, 2). + dims (np.ndarray): Dimensions in kitti label file with shape (N, 2). + angles (np.ndarray, optional): Rotation_y in kitti label file with + shape (N). 
Defaults to None. + origin (list or array or float, optional): origin point relate to + smallest point. Defaults to 0.5. + + Returns: + np.ndarray: Corners with the shape of (N, 4, 2). + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +@numba.jit(nopython=True) +def depth_to_points(depth, trunc_pixel): + """Convert depth map to points. + + Args: + depth (np.array, shape=[H, W]): Depth map which + the row of [0~`trunc_pixel`] are truncated. + trunc_pixel (int): The number of truncated row. + + Returns: + np.ndarray: Points in camera coordinates. + """ + num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) + points = np.zeros((num_pts, 3), dtype=depth.dtype) + x = np.array([0, 0, 1], dtype=depth.dtype) + k = 0 + for i in range(trunc_pixel, depth.shape[0]): + for j in range(depth.shape[1]): + if depth[i, j] > 0.1: + x = np.array([j, i, 1], dtype=depth.dtype) + points[k] = x * depth[i, j] + k += 1 + return points + + +def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): + """Convert depth map to points in lidar coordinate. + + Args: + depth (np.array, shape=[H, W]): Depth map which + the row of [0~`trunc_pixel`] are truncated. + trunc_pixel (int): The number of truncated row. + P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray: Points in lidar coordinates. + """ + pts = depth_to_points(depth, trunc_pixel) + points_shape = list(pts.shape[0:-1]) + points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) + points = points @ np.linalg.inv(P2.T) + lidar_points = camera_to_lidar(points, r_rect, velo2cam) + return lidar_points + + +def center_to_corner_box3d(centers, + dims, + angles=None, + origin=(0.5, 1.0, 0.5), + axis=1): + """Convert kitti locations, dimensions and angles to corners. + + Args: + centers (np.ndarray): Locations in kitti label file with shape (N, 3). + dims (np.ndarray): Dimensions in kitti label file with shape (N, 3). + angles (np.ndarray, optional): Rotation_y in kitti label file with + shape (N). Defaults to None. + origin (list or array or float, optional): Origin point relate to + smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) + in lidar. Defaults to (0.5, 1.0, 0.5). + axis (int, optional): Rotation axis. 1 for camera and 2 for lidar. + Defaults to 1. + + Returns: + np.ndarray: Corners with the shape of (N, 8, 3). + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.reshape([-1, 1, 3]) + return corners + + +@numba.jit(nopython=True) +def box2d_to_corner_jit(boxes): + """Convert box2d to corner. + + Args: + boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation. + + Returns: + box_corners (np.ndarray, shape=[N, 4, 2]): Box corners. 
+ """ + num_box = boxes.shape[0] + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( + 1, 4, 2) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) + for i in range(num_box): + rot_sin = np.sin(boxes[i, -1]) + rot_cos = np.cos(boxes[i, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin + rot_mat_T[1, 1] = rot_cos + box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] + return box_corners + + +@numba.njit +def corner_to_standup_nd_jit(boxes_corner): + """Convert boxes_corner to aligned (min-max) boxes. + + Args: + boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners. + + Returns: + np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes. + """ + num_boxes = boxes_corner.shape[0] + ndim = boxes_corner.shape[-1] + result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) + for i in range(num_boxes): + for j in range(ndim): + result[i, j] = np.min(boxes_corner[i, :, j]) + for j in range(ndim): + result[i, j + ndim] = np.max(boxes_corner[i, :, j]) + return result + + +@numba.jit(nopython=True) +def corner_to_surfaces_3d_jit(corners): + """Convert 3d box corners from corner function above to surfaces that + normal vectors all direct to internal. + + Args: + corners (np.ndarray): 3d box corners with the shape of (N, 8, 3). + + Returns: + np.ndarray: Surfaces with the shape of (N, 6, 4, 3). + """ + # box_corners: [N, 8, 3], must from corner functions in this module + num_boxes = corners.shape[0] + surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) + corner_idxes = np.array([ + 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 + ]).reshape(6, 4) + for i in range(num_boxes): + for j in range(6): + for k in range(4): + surfaces[i, j, k] = corners[i, corner_idxes[j, k]] + return surfaces + + +def rotation_points_single_angle(points, angle, axis=0): + """Rotate points with a single angle. + + Args: + points (np.ndarray, shape=[N, 3]]): + angle (np.ndarray, shape=[1]]): + axis (int, optional): Axis to rotate at. Defaults to 0. + + Returns: + np.ndarray: Rotated points. + """ + # points: [N, 3] + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + if axis == 1: + rot_mat_T = np.array( + [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]], + dtype=points.dtype) + elif axis == 2 or axis == -1: + rot_mat_T = np.array( + [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]], + dtype=points.dtype) + elif axis == 0: + rot_mat_T = np.array( + [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]], + dtype=points.dtype) + else: + raise ValueError('axis should in range') + + return points @ rot_mat_T, rot_mat_T + + +def box3d_to_bbox(box3d, P2): + """Convert box3d in camera coordinates to bbox in image coordinates. + + Args: + box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. + P2 (np.array, shape=[4, 4]): Intrinsics of Camera2. + + Returns: + np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates. 
+ """ + box_corners = center_to_corner_box3d( + box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1) + box_corners_in_image = points_cam2img(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = np.min(box_corners_in_image, axis=1) + maxxy = np.max(box_corners_in_image, axis=1) + bbox = np.concatenate([minxy, maxxy], axis=1) + return bbox + + +def corner_to_surfaces_3d(corners): + """convert 3d box corners from corner function above to surfaces that + normal vectors all direct to internal. + + Args: + corners (np.ndarray): 3D box corners with shape of (N, 8, 3). + + Returns: + np.ndarray: Surfaces with the shape of (N, 6, 4, 3). + """ + # box_corners: [N, 8, 3], must from corner functions in this module + surfaces = np.array([ + [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], + [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], + [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], + [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], + [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], + [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], + ]).transpose([2, 0, 1, 3]) + return surfaces + + +def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): + """Check points in rotated bbox and return indices. + + Note: + This function is for counterclockwise boxes. + + Args: + points (np.ndarray, shape=[N, 3+dim]): Points to query. + rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation. + z_axis (int, optional): Indicate which axis is height. + Defaults to 2. + origin (tuple[int], optional): Indicate the position of + box center. Defaults to (0.5, 0.5, 0). + + Returns: + np.ndarray, shape=[N, M]: Indices of points in each box. + """ + # TODO: this function is different from PointCloud3D, be careful + # when start to use nuscene, check the input + rbbox_corners = center_to_corner_box3d( + rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis) + surfaces = corner_to_surfaces_3d(rbbox_corners) + indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + return indices + + +def minmax_to_corner_2d(minmax_box): + """Convert minmax box to corners2d. + + Args: + minmax_box (np.ndarray, shape=[N, dims]): minmax boxes. + + Returns: + np.ndarray: 2d corners of boxes + """ + ndim = minmax_box.shape[-1] // 2 + center = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center + return center_to_corner_box2d(center, dims, origin=0.0) + + +def create_anchors_3d_range(feature_size, + anchor_range, + sizes=((3.9, 1.6, 1.56), ), + rotations=(0, np.pi / 2), + dtype=np.float32): + """Create anchors 3d by range. + + Args: + feature_size (list[float] | tuple[float]): Feature map size. It is + either a list of a tuple of [D, H, W](in order of z, y, and x). + anchor_range (torch.Tensor | list[float]): Range of anchors with + shape [6]. The order is consistent with that of anchors, i.e., + (x_min, y_min, z_min, x_max, y_max, z_max). + sizes (list[list] | np.ndarray | torch.Tensor, optional): + Anchor size with shape [N, 3], in order of x, y, z. + Defaults to ((3.9, 1.6, 1.56), ). + rotations (list[float] | np.ndarray | torch.Tensor, optional): + Rotations of anchors in a single feature grid. + Defaults to (0, np.pi / 2). + dtype (type, optional): Data type. Defaults to np.float32. + + Returns: + np.ndarray: Range based anchors with shape of + (*feature_size, num_sizes, num_rots, 7). 
+ """ + anchor_range = np.array(anchor_range, dtype) + z_centers = np.linspace( + anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype) + y_centers = np.linspace( + anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype) + x_centers = np.linspace( + anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype) + sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) + rotations = np.array(rotations, dtype=dtype) + rets = np.meshgrid( + x_centers, y_centers, z_centers, rotations, indexing='ij') + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) + rets[i] = rets[i][..., np.newaxis] # for concat + sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = np.tile(sizes, tile_size_shape) + rets.insert(3, sizes) + ret = np.concatenate(rets, axis=-1) + return np.transpose(ret, [2, 1, 0, 3, 4, 5]) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + """Center to minmax. + + Args: + centers (np.ndarray): Center points. + dims (np.ndarray): Dimensions. + origin (list or array or float, optional): Origin point relate + to smallest point. Defaults to 0.5. + + Returns: + np.ndarray: Minmax points. + """ + if origin == 0.5: + return np.concatenate([centers - dims / 2, centers + dims / 2], + axis=-1) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. + + Args: + rbboxes (np.ndarray): Rotated bboxes with shape of + (N, 5(x, y, xdim, ydim, rad)). + + Returns: + np.ndarray: Bounding boxes with the shape of + (N, 4(xmin, ymin, xmax, ymax)). + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] + bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +@numba.jit(nopython=True) +def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): + """Calculate box iou. Note that jit version runs ~10x faster than the + box_overlaps function in mmdet3d.core.evaluation. + + Note: + This function is for counterclockwise boxes. + + Args: + boxes (np.ndarray): Input bounding boxes with shape of (N, 4). + query_boxes (np.ndarray): Query boxes with shape of (K, 4). + mode (str, optional): IoU mode. Defaults to 'iou'. + eps (float, optional): Value added to denominator. Defaults to 0. + + Returns: + np.ndarray: Overlap between boxes and query_boxes + with the shape of [N, K]. + """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * + (query_boxes[k, 3] - query_boxes[k, 1] + eps)) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + eps) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + eps) + if ih > 0: + if mode == 'iou': + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps) + box_area - + iw * ih) + else: + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps)) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def projection_matrix_to_CRT_kitti(proj): + """Split projection matrix of KITTI. 
+
+    Note:
+        This function is for KITTI only.
+
+        P = C @ [R|T]
+        C is an upper triangular matrix, so we invert CR and use QR
+        decomposition, which is stable for all KITTI camera projection
+        matrices.
+
+    Args:
+        proj (np.ndarray, shape=[4, 4]): Intrinsics of camera.
+
+    Returns:
+        tuple[np.ndarray]: Split matrices C, R and T.
+    """
+
+    CR = proj[0:3, 0:3]
+    CT = proj[0:3, 3]
+    RinvCinv = np.linalg.inv(CR)
+    Rinv, Cinv = np.linalg.qr(RinvCinv)
+    C = np.linalg.inv(Cinv)
+    R = np.linalg.inv(Rinv)
+    T = Cinv @ CT
+    return C, R, T
+
+
+def remove_outside_points(points, rect, Trv2c, P2, image_shape):
+    """Remove points which are outside of the image.
+
+    Note:
+        This function is for KITTI only.
+
+    Args:
+        points (np.ndarray, shape=[N, 3+dims]): Total points.
+        rect (np.ndarray, shape=[4, 4]): Matrix to project points in
+            specific camera coordinate (e.g. CAM2) to CAM0.
+        Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in
+            camera coordinate to lidar coordinate.
+        P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
+        image_shape (list[int]): Shape of image.
+
+    Returns:
+        np.ndarray, shape=[N, 3+dims]: Filtered points.
+    """
+    # 5x faster than remove_outside_points_v1 (2ms vs 10ms)
+    C, R, T = projection_matrix_to_CRT_kitti(P2)
+    image_bbox = [0, 0, image_shape[1], image_shape[0]]
+    frustum = get_frustum(image_bbox, C)
+    frustum -= T
+    frustum = np.linalg.inv(R) @ frustum.T
+    frustum = camera_to_lidar(frustum.T, rect, Trv2c)
+    frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...])
+    indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces)
+    points = points[indices.reshape([-1])]
+    return points
+
+
+def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
+    """Get frustum corners in camera coordinates.
+
+    Args:
+        bbox_image (list[int]): Box in image coordinates.
+        C (np.ndarray): Intrinsics.
+        near_clip (float, optional): Nearest distance of frustum.
+            Defaults to 0.001.
+        far_clip (float, optional): Farthest distance of frustum.
+            Defaults to 100.
+
+    Returns:
+        np.ndarray, shape=[8, 3]: Coordinates of frustum corners.
+    """
+    fku = C[0, 0]
+    fkv = -C[1, 1]
+    u0v0 = C[0:2, 2]
+    z_points = np.array(
+        [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis]
+    b = bbox_image
+    box_corners = np.array(
+        [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]],
+        dtype=C.dtype)
+    near_box_corners = (box_corners - u0v0) / np.array(
+        [fku / near_clip, -fkv / near_clip], dtype=C.dtype)
+    far_box_corners = (box_corners - u0v0) / np.array(
+        [fku / far_clip, -fkv / far_clip], dtype=C.dtype)
+    ret_xy = np.concatenate([near_box_corners, far_box_corners],
+                            axis=0)  # [8, 2]
+    ret_xyz = np.concatenate([ret_xy, z_points], axis=1)
+    return ret_xyz
+
+
+def surface_equ_3d(polygon_surfaces):
+    """Compute normal vectors and plane offsets of polygon surfaces.
+
+    Args:
+        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+            [num_polygon, max_num_surfaces, max_num_points_of_surface, 3].
+            All surfaces' normal vectors must point to the interior.
+            max_num_points_of_surface must be at least 3.
+
+    Returns:
+        tuple: Normal vectors and the offsets d in ax + by + cz + d = 0.
+    """
+    # return [a, b, c], d in ax+by+cz+d=0
+    # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
+    surface_vec = polygon_surfaces[:, :, :2, :] - \
+        polygon_surfaces[:, :, 1:3, :]
+    # normal_vec: [..., 3]
+    normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :])
+    # print(normal_vec.shape, points[..., 0, :].shape)
+    # d = -np.inner(normal_vec, points[..., 0, :])
+    d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :])
+    return normal_vec, -d
+
+
+@numba.njit
+def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d,
+                                     num_surfaces):
+    """Numba kernel of :func:`points_in_convex_polygon_3d_jit`.
+
+    Args:
+        points (np.ndarray): Input points with shape of (num_points, 3).
+        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+            (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
+            All surfaces' normal vectors must point to the interior.
+            max_num_points_of_surface must be at least 3.
+        normal_vec (np.ndarray): Normal vectors of polygon_surfaces.
+        d (np.ndarray): Offsets of the surfaces, i.e. the d in
+            ax + by + cz + d = 0.
+        num_surfaces (np.ndarray): Number of surfaces each polygon contains,
+            with shape of (num_polygon).
+
+    Returns:
+        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+    """
+    max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
+    num_points = points.shape[0]
+    num_polygons = polygon_surfaces.shape[0]
+    ret = np.ones((num_points, num_polygons), dtype=np.bool_)
+    sign = 0.0
+    for i in range(num_points):
+        for j in range(num_polygons):
+            for k in range(max_num_surfaces):
+                if k > num_surfaces[j]:
+                    break
+                sign = (
+                    points[i, 0] * normal_vec[j, k, 0] +
+                    points[i, 1] * normal_vec[j, k, 1] +
+                    points[i, 2] * normal_vec[j, k, 2] + d[j, k])
+                if sign >= 0:
+                    ret[i, j] = False
+                    break
+    return ret
+
+
+def points_in_convex_polygon_3d_jit(points,
+                                    polygon_surfaces,
+                                    num_surfaces=None):
+    """Check whether points are in 3d convex polygons.
+
+    Args:
+        points (np.ndarray): Input points with shape of (num_points, 3).
+        polygon_surfaces (np.ndarray): Polygon surfaces with shape of
+            (num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
+            All surfaces' normal vectors must point to the interior.
+            max_num_points_of_surface must be at least 3.
+        num_surfaces (np.ndarray, optional): Number of surfaces each polygon
+            contains, with shape of (num_polygon). Defaults to None.
+
+    Returns:
+        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+    """
+    max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
+    # num_points = points.shape[0]
+    num_polygons = polygon_surfaces.shape[0]
+    if num_surfaces is None:
+        num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64)
+    normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :])
+    # normal_vec: [num_polygon, max_num_surfaces, 3]
+    # d: [num_polygon, max_num_surfaces]
+    return _points_in_convex_polygon_3d_jit(points, polygon_surfaces,
+                                            normal_vec, d, num_surfaces)
+
+
+@numba.njit
+def points_in_convex_polygon_jit(points, polygon, clockwise=False):
+    """Check whether points are in 2d convex polygons. True when a point is
+    inside the polygon.
+
+    Args:
+        points (np.ndarray): Input points with the shape of [num_points, 2].
+        polygon (np.ndarray): Input polygon with the shape of
+            [num_polygon, num_points_of_polygon, 2].
+        clockwise (bool, optional): Whether the polygon is clockwise. Defaults
+            to False.
+
+    Returns:
+        np.ndarray: Result matrix with the shape of [num_points, num_polygon].
+ """ + # first convert polygon to directed lines + num_points_of_polygon = polygon.shape[1] + num_points = points.shape[0] + num_polygons = polygon.shape[0] + # vec for all the polygons + if clockwise: + vec1 = polygon - polygon[:, + np.array([num_points_of_polygon - 1] + list( + range(num_points_of_polygon - 1))), :] + else: + vec1 = polygon[:, + np.array([num_points_of_polygon - 1] + + list(range(num_points_of_polygon - + 1))), :] - polygon + ret = np.zeros((num_points, num_polygons), dtype=np.bool_) + success = True + cross = 0.0 + for i in range(num_points): + for j in range(num_polygons): + success = True + for k in range(num_points_of_polygon): + vec = vec1[j, k] + cross = vec[1] * (polygon[j, k, 0] - points[i, 0]) + cross -= vec[0] * (polygon[j, k, 1] - points[i, 1]) + if cross >= 0: + success = False + break + ret[i, j] = success + return ret + + +def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): + """Convert kitti center boxes to corners. + + 7 -------- 4 + /| /| + 6 -------- 5 . + | | | | + . 3 -------- 0 + |/ |/ + 2 -------- 1 + + Note: + This function is for LiDAR boxes only. + + Args: + boxes3d (np.ndarray): Boxes with shape of (N, 7) + [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords, + see the definition of ry in KITTI dataset. + bottom_center (bool, optional): Whether z is on the bottom center + of object. Defaults to True. + + Returns: + np.ndarray: Box corners with the shape of [N, 8, 3]. + """ + boxes_num = boxes3d.shape[0] + x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] + x_corners = np.array([ + x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2., + -x_size / 2., -x_size / 2., x_size / 2. + ], + dtype=np.float32).T + y_corners = np.array([ + -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2., + -y_size / 2., y_size / 2., y_size / 2. + ], + dtype=np.float32).T + if bottom_center: + z_corners = np.zeros((boxes_num, 8), dtype=np.float32) + z_corners[:, 4:8] = z_size.reshape(boxes_num, 1).repeat( + 4, axis=1) # (N, 8) + else: + z_corners = np.array([ + -z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2., + z_size / 2., z_size / 2., z_size / 2., z_size / 2. + ], + dtype=np.float32).T + + ry = boxes3d[:, 6] + zeros, ones = np.zeros( + ry.size, dtype=np.float32), np.ones( + ry.size, dtype=np.float32) + rot_list = np.array([[np.cos(ry), np.sin(ry), zeros], + [-np.sin(ry), np.cos(ry), zeros], + [zeros, zeros, ones]]) # (3, 3, N) + R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3) + + temp_corners = np.concatenate((x_corners.reshape( + -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)), + axis=2) # (N, 8, 3) + rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3) + x_corners = rotated_corners[:, :, 0] + y_corners = rotated_corners[:, :, 1] + z_corners = rotated_corners[:, :, 2] + + x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2] + + x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8) + y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8) + z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8) + + corners = np.concatenate( + (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)), + axis=2) + + return corners.astype(np.float32) diff --git a/mmdet3d/core/bbox/coders/__init__.py b/mmdet3d/core/bbox/coders/__init__.py index b306525..06f8142 100644 --- a/mmdet3d/core/bbox/coders/__init__.py +++ b/mmdet3d/core/bbox/coders/__init__.py @@ -1,19 +1,19 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from mmdet.core.bbox import build_bbox_coder -from .anchor_free_bbox_coder import AnchorFreeBBoxCoder -from .centerpoint_bbox_coders import CenterPointBBoxCoder -from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder -from .fcos3d_bbox_coder import FCOS3DBBoxCoder -from .groupfree3d_bbox_coder import GroupFree3DBBoxCoder -from .monoflex_bbox_coder import MonoFlexCoder -from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder -from .pgd_bbox_coder import PGDBBoxCoder -from .point_xyzwhlr_bbox_coder import PointXYZWHLRBBoxCoder -from .smoke_bbox_coder import SMOKECoder - -__all__ = [ - 'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder', - 'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder', - 'PointXYZWHLRBBoxCoder', 'FCOS3DBBoxCoder', 'PGDBBoxCoder', 'SMOKECoder', - 'MonoFlexCoder' -] +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.core.bbox import build_bbox_coder +from .anchor_free_bbox_coder import AnchorFreeBBoxCoder +from .centerpoint_bbox_coders import CenterPointBBoxCoder +from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder +from .fcos3d_bbox_coder import FCOS3DBBoxCoder +from .groupfree3d_bbox_coder import GroupFree3DBBoxCoder +from .monoflex_bbox_coder import MonoFlexCoder +from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder +from .pgd_bbox_coder import PGDBBoxCoder +from .point_xyzwhlr_bbox_coder import PointXYZWHLRBBoxCoder +from .smoke_bbox_coder import SMOKECoder + +__all__ = [ + 'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder', + 'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder', + 'PointXYZWHLRBBoxCoder', 'FCOS3DBBoxCoder', 'PGDBBoxCoder', 'SMOKECoder', + 'MonoFlexCoder' +] diff --git a/mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py b/mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py index d64f38b..f100ef0 100644 --- a/mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py @@ -1,130 +1,130 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from mmdet.core.bbox.builder import BBOX_CODERS -from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder - - -@BBOX_CODERS.register_module() -class AnchorFreeBBoxCoder(PartialBinBasedBBoxCoder): - """Anchor free bbox coder for 3D boxes. - - Args: - num_dir_bins (int): Number of bins to encode direction angle. - with_rot (bool): Whether the bbox is with rotation. - """ - - def __init__(self, num_dir_bins, with_rot=True): - super(AnchorFreeBBoxCoder, self).__init__( - num_dir_bins, 0, [], with_rot=with_rot) - self.num_dir_bins = num_dir_bins - self.with_rot = with_rot - - def encode(self, gt_bboxes_3d, gt_labels_3d): - """Encode ground truth to prediction targets. - - Args: - gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes - with shape (n, 7). - gt_labels_3d (torch.Tensor): Ground truth classes. - - Returns: - tuple: Targets of center, size and direction. 
- """ - # generate center target - center_target = gt_bboxes_3d.gravity_center - - # generate bbox size target - size_res_target = gt_bboxes_3d.dims / 2 - - # generate dir target - box_num = gt_labels_3d.shape[0] - if self.with_rot: - (dir_class_target, - dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) - dir_res_target /= (2 * np.pi / self.num_dir_bins) - else: - dir_class_target = gt_labels_3d.new_zeros(box_num) - dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) - - return (center_target, size_res_target, dir_class_target, - dir_res_target) - - def decode(self, bbox_out): - """Decode predicted parts to bbox3d. - - Args: - bbox_out (dict): Predictions from model, should contain keys below. - - - center: predicted bottom center of bboxes. - - dir_class: predicted bbox direction class. - - dir_res: predicted bbox direction residual. - - size: predicted bbox size. - - Returns: - torch.Tensor: Decoded bbox3d with shape (batch, n, 7). - """ - center = bbox_out['center'] - batch_size, num_proposal = center.shape[:2] - - # decode heading angle - if self.with_rot: - dir_class = torch.argmax(bbox_out['dir_class'], -1) - dir_res = torch.gather(bbox_out['dir_res'], 2, - dir_class.unsqueeze(-1)) - dir_res.squeeze_(2) - dir_angle = self.class2angle(dir_class, dir_res).reshape( - batch_size, num_proposal, 1) - else: - dir_angle = center.new_zeros(batch_size, num_proposal, 1) - - # decode bbox size - bbox_size = torch.clamp(bbox_out['size'] * 2, min=0.1) - - bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) - return bbox3d - - def split_pred(self, cls_preds, reg_preds, base_xyz): - """Split predicted features to specific parts. - - Args: - cls_preds (torch.Tensor): Class predicted features to split. - reg_preds (torch.Tensor): Regression predicted features to split. - base_xyz (torch.Tensor): Coordinates of points. - - Returns: - dict[str, torch.Tensor]: Split results. - """ - results = {} - results['obj_scores'] = cls_preds - - start, end = 0, 0 - reg_preds_trans = reg_preds.transpose(2, 1) - - # decode center - end += 3 - # (batch_size, num_proposal, 3) - results['center_offset'] = reg_preds_trans[..., start:end] - results['center'] = base_xyz.detach() + reg_preds_trans[..., start:end] - start = end - - # decode center - end += 3 - # (batch_size, num_proposal, 3) - results['size'] = reg_preds_trans[..., start:end] - start = end - - # decode direction - end += self.num_dir_bins - results['dir_class'] = reg_preds_trans[..., start:end] - start = end - - end += self.num_dir_bins - dir_res_norm = reg_preds_trans[..., start:end] - start = end - - results['dir_res_norm'] = dir_res_norm - results['dir_res'] = dir_res_norm * (2 * np.pi / self.num_dir_bins) - - return results +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.core.bbox.builder import BBOX_CODERS +from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder + + +@BBOX_CODERS.register_module() +class AnchorFreeBBoxCoder(PartialBinBasedBBoxCoder): + """Anchor free bbox coder for 3D boxes. + + Args: + num_dir_bins (int): Number of bins to encode direction angle. + with_rot (bool): Whether the bbox is with rotation. + """ + + def __init__(self, num_dir_bins, with_rot=True): + super(AnchorFreeBBoxCoder, self).__init__( + num_dir_bins, 0, [], with_rot=with_rot) + self.num_dir_bins = num_dir_bins + self.with_rot = with_rot + + def encode(self, gt_bboxes_3d, gt_labels_3d): + """Encode ground truth to prediction targets. 
+ + Args: + gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes + with shape (n, 7). + gt_labels_3d (torch.Tensor): Ground truth classes. + + Returns: + tuple: Targets of center, size and direction. + """ + # generate center target + center_target = gt_bboxes_3d.gravity_center + + # generate bbox size target + size_res_target = gt_bboxes_3d.dims / 2 + + # generate dir target + box_num = gt_labels_3d.shape[0] + if self.with_rot: + (dir_class_target, + dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) + dir_res_target /= (2 * np.pi / self.num_dir_bins) + else: + dir_class_target = gt_labels_3d.new_zeros(box_num) + dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) + + return (center_target, size_res_target, dir_class_target, + dir_res_target) + + def decode(self, bbox_out): + """Decode predicted parts to bbox3d. + + Args: + bbox_out (dict): Predictions from model, should contain keys below. + + - center: predicted bottom center of bboxes. + - dir_class: predicted bbox direction class. + - dir_res: predicted bbox direction residual. + - size: predicted bbox size. + + Returns: + torch.Tensor: Decoded bbox3d with shape (batch, n, 7). + """ + center = bbox_out['center'] + batch_size, num_proposal = center.shape[:2] + + # decode heading angle + if self.with_rot: + dir_class = torch.argmax(bbox_out['dir_class'], -1) + dir_res = torch.gather(bbox_out['dir_res'], 2, + dir_class.unsqueeze(-1)) + dir_res.squeeze_(2) + dir_angle = self.class2angle(dir_class, dir_res).reshape( + batch_size, num_proposal, 1) + else: + dir_angle = center.new_zeros(batch_size, num_proposal, 1) + + # decode bbox size + bbox_size = torch.clamp(bbox_out['size'] * 2, min=0.1) + + bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) + return bbox3d + + def split_pred(self, cls_preds, reg_preds, base_xyz): + """Split predicted features to specific parts. + + Args: + cls_preds (torch.Tensor): Class predicted features to split. + reg_preds (torch.Tensor): Regression predicted features to split. + base_xyz (torch.Tensor): Coordinates of points. + + Returns: + dict[str, torch.Tensor]: Split results. + """ + results = {} + results['obj_scores'] = cls_preds + + start, end = 0, 0 + reg_preds_trans = reg_preds.transpose(2, 1) + + # decode center + end += 3 + # (batch_size, num_proposal, 3) + results['center_offset'] = reg_preds_trans[..., start:end] + results['center'] = base_xyz.detach() + reg_preds_trans[..., start:end] + start = end + + # decode center + end += 3 + # (batch_size, num_proposal, 3) + results['size'] = reg_preds_trans[..., start:end] + start = end + + # decode direction + end += self.num_dir_bins + results['dir_class'] = reg_preds_trans[..., start:end] + start = end + + end += self.num_dir_bins + dir_res_norm = reg_preds_trans[..., start:end] + start = end + + results['dir_res_norm'] = dir_res_norm + results['dir_res'] = dir_res_norm * (2 * np.pi / self.num_dir_bins) + + return results diff --git a/mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py b/mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py index 6d43a63..7622d55 100644 --- a/mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py +++ b/mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py @@ -1,229 +1,229 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS - - -@BBOX_CODERS.register_module() -class CenterPointBBoxCoder(BaseBBoxCoder): - """Bbox coder for CenterPoint. - - Args: - pc_range (list[float]): Range of point cloud. 
- out_size_factor (int): Downsample factor of the model. - voxel_size (list[float]): Size of voxel. - post_center_range (list[float], optional): Limit of the center. - Default: None. - max_num (int, optional): Max number to be kept. Default: 100. - score_threshold (float, optional): Threshold to filter boxes - based on score. Default: None. - code_size (int, optional): Code size of bboxes. Default: 9 - """ - - def __init__(self, - pc_range, - out_size_factor, - voxel_size, - post_center_range=None, - max_num=100, - score_threshold=None, - code_size=9): - - self.pc_range = pc_range - self.out_size_factor = out_size_factor - self.voxel_size = voxel_size - self.post_center_range = post_center_range - self.max_num = max_num - self.score_threshold = score_threshold - self.code_size = code_size - - def _gather_feat(self, feats, inds, feat_masks=None): - """Given feats and indexes, returns the gathered feats. - - Args: - feats (torch.Tensor): Features to be transposed and gathered - with the shape of [B, 2, W, H]. - inds (torch.Tensor): Indexes with the shape of [B, N]. - feat_masks (torch.Tensor, optional): Mask of the feats. - Default: None. - - Returns: - torch.Tensor: Gathered feats. - """ - dim = feats.size(2) - inds = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), dim) - feats = feats.gather(1, inds) - if feat_masks is not None: - feat_masks = feat_masks.unsqueeze(2).expand_as(feats) - feats = feats[feat_masks] - feats = feats.view(-1, dim) - return feats - - def _topk(self, scores, K=80): - """Get indexes based on scores. - - Args: - scores (torch.Tensor): scores with the shape of [B, N, W, H]. - K (int, optional): Number to be kept. Defaults to 80. - - Returns: - tuple[torch.Tensor] - torch.Tensor: Selected scores with the shape of [B, K]. - torch.Tensor: Selected indexes with the shape of [B, K]. - torch.Tensor: Selected classes with the shape of [B, K]. - torch.Tensor: Selected y coord with the shape of [B, K]. - torch.Tensor: Selected x coord with the shape of [B, K]. - """ - batch, cat, height, width = scores.size() - - topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) - - topk_inds = topk_inds % (height * width) - topk_ys = (topk_inds.float() / - torch.tensor(width, dtype=torch.float)).int().float() - topk_xs = (topk_inds % width).int().float() - - topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) - topk_clses = (topk_ind / torch.tensor(K, dtype=torch.float)).int() - topk_inds = self._gather_feat(topk_inds.view(batch, -1, 1), - topk_ind).view(batch, K) - topk_ys = self._gather_feat(topk_ys.view(batch, -1, 1), - topk_ind).view(batch, K) - topk_xs = self._gather_feat(topk_xs.view(batch, -1, 1), - topk_ind).view(batch, K) - - return topk_score, topk_inds, topk_clses, topk_ys, topk_xs - - def _transpose_and_gather_feat(self, feat, ind): - """Given feats and indexes, returns the transposed and gathered feats. - - Args: - feat (torch.Tensor): Features to be transposed and gathered - with the shape of [B, 2, W, H]. - ind (torch.Tensor): Indexes with the shape of [B, N]. - - Returns: - torch.Tensor: Transposed and gathered feats. - """ - feat = feat.permute(0, 2, 3, 1).contiguous() - feat = feat.view(feat.size(0), -1, feat.size(3)) - feat = self._gather_feat(feat, ind) - return feat - - def encode(self): - pass - - def decode(self, - heat, - rot_sine, - rot_cosine, - hei, - dim, - vel, - reg=None, - task_id=-1): - """Decode bboxes. - - Args: - heat (torch.Tensor): Heatmap with the shape of [B, N, W, H]. 
- rot_sine (torch.Tensor): Sine of rotation with the shape of - [B, 1, W, H]. - rot_cosine (torch.Tensor): Cosine of rotation with the shape of - [B, 1, W, H]. - hei (torch.Tensor): Height of the boxes with the shape - of [B, 1, W, H]. - dim (torch.Tensor): Dim of the boxes with the shape of - [B, 1, W, H]. - vel (torch.Tensor): Velocity with the shape of [B, 1, W, H]. - reg (torch.Tensor, optional): Regression value of the boxes in - 2D with the shape of [B, 2, W, H]. Default: None. - task_id (int, optional): Index of task. Default: -1. - - Returns: - list[dict]: Decoded boxes. - """ - batch, cat, _, _ = heat.size() - - scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num) - - if reg is not None: - reg = self._transpose_and_gather_feat(reg, inds) - reg = reg.view(batch, self.max_num, 2) - xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1] - ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2] - else: - xs = xs.view(batch, self.max_num, 1) + 0.5 - ys = ys.view(batch, self.max_num, 1) + 0.5 - - # rotation value and direction label - rot_sine = self._transpose_and_gather_feat(rot_sine, inds) - rot_sine = rot_sine.view(batch, self.max_num, 1) - - rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds) - rot_cosine = rot_cosine.view(batch, self.max_num, 1) - rot = torch.atan2(rot_sine, rot_cosine) - - # height in the bev - hei = self._transpose_and_gather_feat(hei, inds) - hei = hei.view(batch, self.max_num, 1) - - # dim of the box - dim = self._transpose_and_gather_feat(dim, inds) - dim = dim.view(batch, self.max_num, 3) - - # class label - clses = clses.view(batch, self.max_num).float() - scores = scores.view(batch, self.max_num) - - xs = xs.view( - batch, self.max_num, - 1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0] - ys = ys.view( - batch, self.max_num, - 1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1] - - if vel is None: # KITTI FORMAT - final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2) - else: # exist velocity, nuscene format - vel = self._transpose_and_gather_feat(vel, inds) - vel = vel.view(batch, self.max_num, 2) - final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2) - - final_scores = scores - final_preds = clses - - # use score threshold - if self.score_threshold is not None: - thresh_mask = final_scores > self.score_threshold - - if self.post_center_range is not None: - self.post_center_range = torch.tensor( - self.post_center_range, device=heat.device) - mask = (final_box_preds[..., :3] >= - self.post_center_range[:3]).all(2) - mask &= (final_box_preds[..., :3] <= - self.post_center_range[3:]).all(2) - - predictions_dicts = [] - for i in range(batch): - cmask = mask[i, :] - if self.score_threshold: - cmask &= thresh_mask[i] - - boxes3d = final_box_preds[i, cmask] - scores = final_scores[i, cmask] - labels = final_preds[i, cmask] - predictions_dict = { - 'bboxes': boxes3d, - 'scores': scores, - 'labels': labels - } - - predictions_dicts.append(predictions_dict) - else: - raise NotImplementedError( - 'Need to reorganize output as a batch, only ' - 'support post_center_range is not None for now!') - - return predictions_dicts +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class CenterPointBBoxCoder(BaseBBoxCoder): + """Bbox coder for CenterPoint. + + Args: + pc_range (list[float]): Range of point cloud. + out_size_factor (int): Downsample factor of the model. 
+ voxel_size (list[float]): Size of voxel. + post_center_range (list[float], optional): Limit of the center. + Default: None. + max_num (int, optional): Max number to be kept. Default: 100. + score_threshold (float, optional): Threshold to filter boxes + based on score. Default: None. + code_size (int, optional): Code size of bboxes. Default: 9 + """ + + def __init__(self, + pc_range, + out_size_factor, + voxel_size, + post_center_range=None, + max_num=100, + score_threshold=None, + code_size=9): + + self.pc_range = pc_range + self.out_size_factor = out_size_factor + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.code_size = code_size + + def _gather_feat(self, feats, inds, feat_masks=None): + """Given feats and indexes, returns the gathered feats. + + Args: + feats (torch.Tensor): Features to be transposed and gathered + with the shape of [B, 2, W, H]. + inds (torch.Tensor): Indexes with the shape of [B, N]. + feat_masks (torch.Tensor, optional): Mask of the feats. + Default: None. + + Returns: + torch.Tensor: Gathered feats. + """ + dim = feats.size(2) + inds = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), dim) + feats = feats.gather(1, inds) + if feat_masks is not None: + feat_masks = feat_masks.unsqueeze(2).expand_as(feats) + feats = feats[feat_masks] + feats = feats.view(-1, dim) + return feats + + def _topk(self, scores, K=80): + """Get indexes based on scores. + + Args: + scores (torch.Tensor): scores with the shape of [B, N, W, H]. + K (int, optional): Number to be kept. Defaults to 80. + + Returns: + tuple[torch.Tensor] + torch.Tensor: Selected scores with the shape of [B, K]. + torch.Tensor: Selected indexes with the shape of [B, K]. + torch.Tensor: Selected classes with the shape of [B, K]. + torch.Tensor: Selected y coord with the shape of [B, K]. + torch.Tensor: Selected x coord with the shape of [B, K]. + """ + batch, cat, height, width = scores.size() + + topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) + + topk_inds = topk_inds % (height * width) + topk_ys = (topk_inds.float() / + torch.tensor(width, dtype=torch.float)).int().float() + topk_xs = (topk_inds % width).int().float() + + topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) + topk_clses = (topk_ind / torch.tensor(K, dtype=torch.float)).int() + topk_inds = self._gather_feat(topk_inds.view(batch, -1, 1), + topk_ind).view(batch, K) + topk_ys = self._gather_feat(topk_ys.view(batch, -1, 1), + topk_ind).view(batch, K) + topk_xs = self._gather_feat(topk_xs.view(batch, -1, 1), + topk_ind).view(batch, K) + + return topk_score, topk_inds, topk_clses, topk_ys, topk_xs + + def _transpose_and_gather_feat(self, feat, ind): + """Given feats and indexes, returns the transposed and gathered feats. + + Args: + feat (torch.Tensor): Features to be transposed and gathered + with the shape of [B, 2, W, H]. + ind (torch.Tensor): Indexes with the shape of [B, N]. + + Returns: + torch.Tensor: Transposed and gathered feats. + """ + feat = feat.permute(0, 2, 3, 1).contiguous() + feat = feat.view(feat.size(0), -1, feat.size(3)) + feat = self._gather_feat(feat, ind) + return feat + + def encode(self): + pass + + def decode(self, + heat, + rot_sine, + rot_cosine, + hei, + dim, + vel, + reg=None, + task_id=-1): + """Decode bboxes. + + Args: + heat (torch.Tensor): Heatmap with the shape of [B, N, W, H]. + rot_sine (torch.Tensor): Sine of rotation with the shape of + [B, 1, W, H]. 
+ rot_cosine (torch.Tensor): Cosine of rotation with the shape of + [B, 1, W, H]. + hei (torch.Tensor): Height of the boxes with the shape + of [B, 1, W, H]. + dim (torch.Tensor): Dim of the boxes with the shape of + [B, 1, W, H]. + vel (torch.Tensor): Velocity with the shape of [B, 1, W, H]. + reg (torch.Tensor, optional): Regression value of the boxes in + 2D with the shape of [B, 2, W, H]. Default: None. + task_id (int, optional): Index of task. Default: -1. + + Returns: + list[dict]: Decoded boxes. + """ + batch, cat, _, _ = heat.size() + + scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num) + + if reg is not None: + reg = self._transpose_and_gather_feat(reg, inds) + reg = reg.view(batch, self.max_num, 2) + xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1] + ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2] + else: + xs = xs.view(batch, self.max_num, 1) + 0.5 + ys = ys.view(batch, self.max_num, 1) + 0.5 + + # rotation value and direction label + rot_sine = self._transpose_and_gather_feat(rot_sine, inds) + rot_sine = rot_sine.view(batch, self.max_num, 1) + + rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds) + rot_cosine = rot_cosine.view(batch, self.max_num, 1) + rot = torch.atan2(rot_sine, rot_cosine) + + # height in the bev + hei = self._transpose_and_gather_feat(hei, inds) + hei = hei.view(batch, self.max_num, 1) + + # dim of the box + dim = self._transpose_and_gather_feat(dim, inds) + dim = dim.view(batch, self.max_num, 3) + + # class label + clses = clses.view(batch, self.max_num).float() + scores = scores.view(batch, self.max_num) + + xs = xs.view( + batch, self.max_num, + 1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0] + ys = ys.view( + batch, self.max_num, + 1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1] + + if vel is None: # KITTI FORMAT + final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2) + else: # exist velocity, nuscene format + vel = self._transpose_and_gather_feat(vel, inds) + vel = vel.view(batch, self.max_num, 2) + final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2) + + final_scores = scores + final_preds = clses + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=heat.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(2) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(2) + + predictions_dicts = [] + for i in range(batch): + cmask = mask[i, :] + if self.score_threshold: + cmask &= thresh_mask[i] + + boxes3d = final_box_preds[i, cmask] + scores = final_scores[i, cmask] + labels = final_preds[i, cmask] + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels + } + + predictions_dicts.append(predictions_dict) + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + + return predictions_dicts diff --git a/mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py b/mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py index 931e839..9a1cf67 100644 --- a/mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py @@ -1,91 +1,91 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
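# Illustrative usage sketch for the CenterPointBBoxCoder added above (not part
# of the patched files; the config values and random feature maps are made
# up). It only exercises decode(), which is the path used at test time.
import torch

from mmdet3d.core.bbox.coders import CenterPointBBoxCoder

coder = CenterPointBBoxCoder(
    pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
    out_size_factor=8,
    voxel_size=[0.2, 0.2],
    post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
    max_num=50,
    score_threshold=0.1)

batch, height, width = 2, 16, 16
heat = torch.rand(batch, 3, height, width)  # 3-class heatmap
rot_sine = torch.rand(batch, 1, height, width)
rot_cosine = torch.rand(batch, 1, height, width)
hei = torch.rand(batch, 1, height, width)
dim = torch.rand(batch, 3, height, width)
vel = torch.rand(batch, 2, height, width)
reg = torch.rand(batch, 2, height, width)

preds = coder.decode(heat, rot_sine, rot_cosine, hei, dim, vel, reg=reg)
# One dict per sample; 'bboxes' has 9 values per box because vel is given.
print(len(preds), preds[0]['bboxes'].shape)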
-import torch - -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS - - -@BBOX_CODERS.register_module() -class DeltaXYZWLHRBBoxCoder(BaseBBoxCoder): - """Bbox Coder for 3D boxes. - - Args: - code_size (int): The dimension of boxes to be encoded. - """ - - def __init__(self, code_size=7): - super(DeltaXYZWLHRBBoxCoder, self).__init__() - self.code_size = code_size - - @staticmethod - def encode(src_boxes, dst_boxes): - """Get box regression transformation deltas (dx, dy, dz, dx_size, - dy_size, dz_size, dr, dv*) that can be used to transform the - `src_boxes` into the `target_boxes`. - - Args: - src_boxes (torch.Tensor): source boxes, e.g., object proposals. - dst_boxes (torch.Tensor): target of the transformation, e.g., - ground-truth boxes. - - Returns: - torch.Tensor: Box transformation deltas. - """ - box_ndim = src_boxes.shape[-1] - cas, cgs, cts = [], [], [] - if box_ndim > 7: - xa, ya, za, wa, la, ha, ra, *cas = torch.split( - src_boxes, 1, dim=-1) - xg, yg, zg, wg, lg, hg, rg, *cgs = torch.split( - dst_boxes, 1, dim=-1) - cts = [g - a for g, a in zip(cgs, cas)] - else: - xa, ya, za, wa, la, ha, ra = torch.split(src_boxes, 1, dim=-1) - xg, yg, zg, wg, lg, hg, rg = torch.split(dst_boxes, 1, dim=-1) - za = za + ha / 2 - zg = zg + hg / 2 - diagonal = torch.sqrt(la**2 + wa**2) - xt = (xg - xa) / diagonal - yt = (yg - ya) / diagonal - zt = (zg - za) / ha - lt = torch.log(lg / la) - wt = torch.log(wg / wa) - ht = torch.log(hg / ha) - rt = rg - ra - return torch.cat([xt, yt, zt, wt, lt, ht, rt, *cts], dim=-1) - - @staticmethod - def decode(anchors, deltas): - """Apply transformation `deltas` (dx, dy, dz, dx_size, dy_size, - dz_size, dr, dv*) to `boxes`. - - Args: - anchors (torch.Tensor): Parameters of anchors with shape (N, 7). - deltas (torch.Tensor): Encoded boxes with shape - (N, 7+n) [x, y, z, x_size, y_size, z_size, r, velo*]. - - Returns: - torch.Tensor: Decoded boxes. - """ - cas, cts = [], [] - box_ndim = anchors.shape[-1] - if box_ndim > 7: - xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1) - xt, yt, zt, wt, lt, ht, rt, *cts = torch.split(deltas, 1, dim=-1) - else: - xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) - xt, yt, zt, wt, lt, ht, rt = torch.split(deltas, 1, dim=-1) - - za = za + ha / 2 - diagonal = torch.sqrt(la**2 + wa**2) - xg = xt * diagonal + xa - yg = yt * diagonal + ya - zg = zt * ha + za - - lg = torch.exp(lt) * la - wg = torch.exp(wt) * wa - hg = torch.exp(ht) * ha - rg = rt + ra - zg = zg - hg / 2 - cgs = [t + a for t, a in zip(cts, cas)] - return torch.cat([xg, yg, zg, wg, lg, hg, rg, *cgs], dim=-1) +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class DeltaXYZWLHRBBoxCoder(BaseBBoxCoder): + """Bbox Coder for 3D boxes. + + Args: + code_size (int): The dimension of boxes to be encoded. + """ + + def __init__(self, code_size=7): + super(DeltaXYZWLHRBBoxCoder, self).__init__() + self.code_size = code_size + + @staticmethod + def encode(src_boxes, dst_boxes): + """Get box regression transformation deltas (dx, dy, dz, dx_size, + dy_size, dz_size, dr, dv*) that can be used to transform the + `src_boxes` into the `target_boxes`. + + Args: + src_boxes (torch.Tensor): source boxes, e.g., object proposals. + dst_boxes (torch.Tensor): target of the transformation, e.g., + ground-truth boxes. + + Returns: + torch.Tensor: Box transformation deltas. 
+ """ + box_ndim = src_boxes.shape[-1] + cas, cgs, cts = [], [], [] + if box_ndim > 7: + xa, ya, za, wa, la, ha, ra, *cas = torch.split( + src_boxes, 1, dim=-1) + xg, yg, zg, wg, lg, hg, rg, *cgs = torch.split( + dst_boxes, 1, dim=-1) + cts = [g - a for g, a in zip(cgs, cas)] + else: + xa, ya, za, wa, la, ha, ra = torch.split(src_boxes, 1, dim=-1) + xg, yg, zg, wg, lg, hg, rg = torch.split(dst_boxes, 1, dim=-1) + za = za + ha / 2 + zg = zg + hg / 2 + diagonal = torch.sqrt(la**2 + wa**2) + xt = (xg - xa) / diagonal + yt = (yg - ya) / diagonal + zt = (zg - za) / ha + lt = torch.log(lg / la) + wt = torch.log(wg / wa) + ht = torch.log(hg / ha) + rt = rg - ra + return torch.cat([xt, yt, zt, wt, lt, ht, rt, *cts], dim=-1) + + @staticmethod + def decode(anchors, deltas): + """Apply transformation `deltas` (dx, dy, dz, dx_size, dy_size, + dz_size, dr, dv*) to `boxes`. + + Args: + anchors (torch.Tensor): Parameters of anchors with shape (N, 7). + deltas (torch.Tensor): Encoded boxes with shape + (N, 7+n) [x, y, z, x_size, y_size, z_size, r, velo*]. + + Returns: + torch.Tensor: Decoded boxes. + """ + cas, cts = [], [] + box_ndim = anchors.shape[-1] + if box_ndim > 7: + xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1) + xt, yt, zt, wt, lt, ht, rt, *cts = torch.split(deltas, 1, dim=-1) + else: + xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) + xt, yt, zt, wt, lt, ht, rt = torch.split(deltas, 1, dim=-1) + + za = za + ha / 2 + diagonal = torch.sqrt(la**2 + wa**2) + xg = xt * diagonal + xa + yg = yt * diagonal + ya + zg = zt * ha + za + + lg = torch.exp(lt) * la + wg = torch.exp(wt) * wa + hg = torch.exp(ht) * ha + rg = rt + ra + zg = zg - hg / 2 + cgs = [t + a for t, a in zip(cts, cas)] + return torch.cat([xg, yg, zg, wg, lg, hg, rg, *cgs], dim=-1) diff --git a/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py b/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py index 7cb6b1a..b908e8e 100644 --- a/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py @@ -1,127 +1,127 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS -from ..structures import limit_period - - -@BBOX_CODERS.register_module() -class FCOS3DBBoxCoder(BaseBBoxCoder): - """Bounding box coder for FCOS3D. - - Args: - base_depths (tuple[tuple[float]]): Depth references for decode box - depth. Defaults to None. - base_dims (tuple[tuple[float]]): Dimension references for decode box - dimension. Defaults to None. - code_size (int): The dimension of boxes to be encoded. Defaults to 7. - norm_on_bbox (bool): Whether to apply normalization on the bounding - box 2D attributes. Defaults to True. - """ - - def __init__(self, - base_depths=None, - base_dims=None, - code_size=7, - norm_on_bbox=True): - super(FCOS3DBBoxCoder, self).__init__() - self.base_depths = base_depths - self.base_dims = base_dims - self.bbox_code_size = code_size - self.norm_on_bbox = norm_on_bbox - - def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels): - # TODO: refactor the encoder in the FCOS3D and PGD head - pass - - def decode(self, bbox, scale, stride, training, cls_score=None): - """Decode regressed results into 3D predictions. - - Note that offsets are not transformed to the projected 3D centers. - - Args: - bbox (torch.Tensor): Raw bounding box predictions in shape - [N, C, H, W]. - scale (tuple[`Scale`]): Learnable scale parameters. 
- stride (int): Stride for a specific feature level. - training (bool): Whether the decoding is in the training - procedure. - cls_score (torch.Tensor): Classification score map for deciding - which base depth or dim is used. Defaults to None. - - Returns: - torch.Tensor: Decoded boxes. - """ - # scale the bbox of different level - # only apply to offset, depth and size prediction - scale_offset, scale_depth, scale_size = scale[0:3] - - clone_bbox = bbox.clone() - bbox[:, :2] = scale_offset(clone_bbox[:, :2]).float() - bbox[:, 2] = scale_depth(clone_bbox[:, 2]).float() - bbox[:, 3:6] = scale_size(clone_bbox[:, 3:6]).float() - - if self.base_depths is None: - bbox[:, 2] = bbox[:, 2].exp() - elif len(self.base_depths) == 1: # only single prior - mean = self.base_depths[0][0] - std = self.base_depths[0][1] - bbox[:, 2] = mean + bbox.clone()[:, 2] * std - else: # multi-class priors - assert len(self.base_depths) == cls_score.shape[1], \ - 'The number of multi-class depth priors should be equal to ' \ - 'the number of categories.' - indices = cls_score.max(dim=1)[1] - depth_priors = cls_score.new_tensor( - self.base_depths)[indices, :].permute(0, 3, 1, 2) - mean = depth_priors[:, 0] - std = depth_priors[:, 1] - bbox[:, 2] = mean + bbox.clone()[:, 2] * std - - bbox[:, 3:6] = bbox[:, 3:6].exp() - if self.base_dims is not None: - assert len(self.base_dims) == cls_score.shape[1], \ - 'The number of anchor sizes should be equal to the number ' \ - 'of categories.' - indices = cls_score.max(dim=1)[1] - size_priors = cls_score.new_tensor( - self.base_dims)[indices, :].permute(0, 3, 1, 2) - bbox[:, 3:6] = size_priors * bbox.clone()[:, 3:6] - - assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\ - 'has not been thoroughly tested for FCOS3D.' - if self.norm_on_bbox: - if not training: - # Note that this line is conducted only when testing - bbox[:, :2] *= stride - - return bbox - - @staticmethod - def decode_yaw(bbox, centers2d, dir_cls, dir_offset, cam2img): - """Decode yaw angle and change it from local to global.i. - - Args: - bbox (torch.Tensor): Bounding box predictions in shape - [N, C] with yaws to be decoded. - centers2d (torch.Tensor): Projected 3D-center on the image planes - corresponding to the box predictions. - dir_cls (torch.Tensor): Predicted direction classes. - dir_offset (float): Direction offset before dividing all the - directions into several classes. - cam2img (torch.Tensor): Camera intrinsic matrix in shape [4, 4]. - - Returns: - torch.Tensor: Bounding boxes with decoded yaws. - """ - if bbox.shape[0] > 0: - dir_rot = limit_period(bbox[..., 6] - dir_offset, 0, np.pi) - bbox[..., 6] = \ - dir_rot + dir_offset + np.pi * dir_cls.to(bbox.dtype) - - bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2], - cam2img[0, 0]) + bbox[:, 6] - - return bbox +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS +from ..structures import limit_period + + +@BBOX_CODERS.register_module() +class FCOS3DBBoxCoder(BaseBBoxCoder): + """Bounding box coder for FCOS3D. + + Args: + base_depths (tuple[tuple[float]]): Depth references for decode box + depth. Defaults to None. + base_dims (tuple[tuple[float]]): Dimension references for decode box + dimension. Defaults to None. + code_size (int): The dimension of boxes to be encoded. Defaults to 7. + norm_on_bbox (bool): Whether to apply normalization on the bounding + box 2D attributes. Defaults to True. 
+ """ + + def __init__(self, + base_depths=None, + base_dims=None, + code_size=7, + norm_on_bbox=True): + super(FCOS3DBBoxCoder, self).__init__() + self.base_depths = base_depths + self.base_dims = base_dims + self.bbox_code_size = code_size + self.norm_on_bbox = norm_on_bbox + + def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels): + # TODO: refactor the encoder in the FCOS3D and PGD head + pass + + def decode(self, bbox, scale, stride, training, cls_score=None): + """Decode regressed results into 3D predictions. + + Note that offsets are not transformed to the projected 3D centers. + + Args: + bbox (torch.Tensor): Raw bounding box predictions in shape + [N, C, H, W]. + scale (tuple[`Scale`]): Learnable scale parameters. + stride (int): Stride for a specific feature level. + training (bool): Whether the decoding is in the training + procedure. + cls_score (torch.Tensor): Classification score map for deciding + which base depth or dim is used. Defaults to None. + + Returns: + torch.Tensor: Decoded boxes. + """ + # scale the bbox of different level + # only apply to offset, depth and size prediction + scale_offset, scale_depth, scale_size = scale[0:3] + + clone_bbox = bbox.clone() + bbox[:, :2] = scale_offset(clone_bbox[:, :2]).float() + bbox[:, 2] = scale_depth(clone_bbox[:, 2]).float() + bbox[:, 3:6] = scale_size(clone_bbox[:, 3:6]).float() + + if self.base_depths is None: + bbox[:, 2] = bbox[:, 2].exp() + elif len(self.base_depths) == 1: # only single prior + mean = self.base_depths[0][0] + std = self.base_depths[0][1] + bbox[:, 2] = mean + bbox.clone()[:, 2] * std + else: # multi-class priors + assert len(self.base_depths) == cls_score.shape[1], \ + 'The number of multi-class depth priors should be equal to ' \ + 'the number of categories.' + indices = cls_score.max(dim=1)[1] + depth_priors = cls_score.new_tensor( + self.base_depths)[indices, :].permute(0, 3, 1, 2) + mean = depth_priors[:, 0] + std = depth_priors[:, 1] + bbox[:, 2] = mean + bbox.clone()[:, 2] * std + + bbox[:, 3:6] = bbox[:, 3:6].exp() + if self.base_dims is not None: + assert len(self.base_dims) == cls_score.shape[1], \ + 'The number of anchor sizes should be equal to the number ' \ + 'of categories.' + indices = cls_score.max(dim=1)[1] + size_priors = cls_score.new_tensor( + self.base_dims)[indices, :].permute(0, 3, 1, 2) + bbox[:, 3:6] = size_priors * bbox.clone()[:, 3:6] + + assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\ + 'has not been thoroughly tested for FCOS3D.' + if self.norm_on_bbox: + if not training: + # Note that this line is conducted only when testing + bbox[:, :2] *= stride + + return bbox + + @staticmethod + def decode_yaw(bbox, centers2d, dir_cls, dir_offset, cam2img): + """Decode yaw angle and change it from local to global.i. + + Args: + bbox (torch.Tensor): Bounding box predictions in shape + [N, C] with yaws to be decoded. + centers2d (torch.Tensor): Projected 3D-center on the image planes + corresponding to the box predictions. + dir_cls (torch.Tensor): Predicted direction classes. + dir_offset (float): Direction offset before dividing all the + directions into several classes. + cam2img (torch.Tensor): Camera intrinsic matrix in shape [4, 4]. + + Returns: + torch.Tensor: Bounding boxes with decoded yaws. 
+ """ + if bbox.shape[0] > 0: + dir_rot = limit_period(bbox[..., 6] - dir_offset, 0, np.pi) + bbox[..., 6] = \ + dir_rot + dir_offset + np.pi * dir_cls.to(bbox.dtype) + + bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2], + cam2img[0, 0]) + bbox[:, 6] + + return bbox diff --git a/mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py b/mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py index 08d83e9..366f750 100644 --- a/mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py @@ -1,191 +1,191 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from mmdet.core.bbox.builder import BBOX_CODERS -from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder - - -@BBOX_CODERS.register_module() -class GroupFree3DBBoxCoder(PartialBinBasedBBoxCoder): - """Modified partial bin based bbox coder for GroupFree3D. - - Args: - num_dir_bins (int): Number of bins to encode direction angle. - num_sizes (int): Number of size clusters. - mean_sizes (list[list[int]]): Mean size of bboxes in each class. - with_rot (bool, optional): Whether the bbox is with rotation. - Defaults to True. - size_cls_agnostic (bool, optional): Whether the predicted size is - class-agnostic. Defaults to True. - """ - - def __init__(self, - num_dir_bins, - num_sizes, - mean_sizes, - with_rot=True, - size_cls_agnostic=True): - super(GroupFree3DBBoxCoder, self).__init__( - num_dir_bins=num_dir_bins, - num_sizes=num_sizes, - mean_sizes=mean_sizes, - with_rot=with_rot) - self.size_cls_agnostic = size_cls_agnostic - - def encode(self, gt_bboxes_3d, gt_labels_3d): - """Encode ground truth to prediction targets. - - Args: - gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes - with shape (n, 7). - gt_labels_3d (torch.Tensor): Ground truth classes. - - Returns: - tuple: Targets of center, size and direction. - """ - # generate center target - center_target = gt_bboxes_3d.gravity_center - - # generate bbox size target - size_target = gt_bboxes_3d.dims - size_class_target = gt_labels_3d - size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor( - self.mean_sizes)[size_class_target] - - # generate dir target - box_num = gt_labels_3d.shape[0] - if self.with_rot: - (dir_class_target, - dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) - else: - dir_class_target = gt_labels_3d.new_zeros(box_num) - dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) - - return (center_target, size_target, size_class_target, size_res_target, - dir_class_target, dir_res_target) - - def decode(self, bbox_out, prefix=''): - """Decode predicted parts to bbox3d. - - Args: - bbox_out (dict): Predictions from model, should contain keys below. - - - center: predicted bottom center of bboxes. - - dir_class: predicted bbox direction class. - - dir_res: predicted bbox direction residual. - - size_class: predicted bbox size class. - - size_res: predicted bbox size residual. - - size: predicted class-agnostic bbox size - prefix (str, optional): Decode predictions with specific prefix. - Defaults to ''. - - Returns: - torch.Tensor: Decoded bbox3d with shape (batch, n, 7). 
- """ - center = bbox_out[f'{prefix}center'] - batch_size, num_proposal = center.shape[:2] - - # decode heading angle - if self.with_rot: - dir_class = torch.argmax(bbox_out[f'{prefix}dir_class'], -1) - dir_res = torch.gather(bbox_out[f'{prefix}dir_res'], 2, - dir_class.unsqueeze(-1)) - dir_res.squeeze_(2) - dir_angle = self.class2angle(dir_class, dir_res).reshape( - batch_size, num_proposal, 1) - else: - dir_angle = center.new_zeros(batch_size, num_proposal, 1) - - # decode bbox size - if self.size_cls_agnostic: - bbox_size = bbox_out[f'{prefix}size'].reshape( - batch_size, num_proposal, 3) - else: - size_class = torch.argmax( - bbox_out[f'{prefix}size_class'], -1, keepdim=True) - size_res = torch.gather( - bbox_out[f'{prefix}size_res'], 2, - size_class.unsqueeze(-1).repeat(1, 1, 1, 3)) - mean_sizes = center.new_tensor(self.mean_sizes) - size_base = torch.index_select(mean_sizes, 0, - size_class.reshape(-1)) - bbox_size = size_base.reshape(batch_size, num_proposal, - -1) + size_res.squeeze(2) - - bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) - return bbox3d - - def split_pred(self, cls_preds, reg_preds, base_xyz, prefix=''): - """Split predicted features to specific parts. - - Args: - cls_preds (torch.Tensor): Class predicted features to split. - reg_preds (torch.Tensor): Regression predicted features to split. - base_xyz (torch.Tensor): Coordinates of points. - prefix (str, optional): Decode predictions with specific prefix. - Defaults to ''. - - Returns: - dict[str, torch.Tensor]: Split results. - """ - results = {} - start, end = 0, 0 - - cls_preds_trans = cls_preds.transpose(2, 1) - reg_preds_trans = reg_preds.transpose(2, 1) - - # decode center - end += 3 - # (batch_size, num_proposal, 3) - results[f'{prefix}center_residual'] = \ - reg_preds_trans[..., start:end].contiguous() - results[f'{prefix}center'] = base_xyz + \ - reg_preds_trans[..., start:end].contiguous() - start = end - - # decode direction - end += self.num_dir_bins - results[f'{prefix}dir_class'] = \ - reg_preds_trans[..., start:end].contiguous() - start = end - - end += self.num_dir_bins - dir_res_norm = reg_preds_trans[..., start:end].contiguous() - start = end - - results[f'{prefix}dir_res_norm'] = dir_res_norm - results[f'{prefix}dir_res'] = dir_res_norm * ( - np.pi / self.num_dir_bins) - - # decode size - if self.size_cls_agnostic: - end += 3 - results[f'{prefix}size'] = \ - reg_preds_trans[..., start:end].contiguous() - else: - end += self.num_sizes - results[f'{prefix}size_class'] = reg_preds_trans[ - ..., start:end].contiguous() - start = end - - end += self.num_sizes * 3 - size_res_norm = reg_preds_trans[..., start:end] - batch_size, num_proposal = reg_preds_trans.shape[:2] - size_res_norm = size_res_norm.view( - [batch_size, num_proposal, self.num_sizes, 3]) - start = end - - results[f'{prefix}size_res_norm'] = size_res_norm.contiguous() - mean_sizes = reg_preds.new_tensor(self.mean_sizes) - results[f'{prefix}size_res'] = ( - size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0)) - - # decode objectness score - # Group-Free-3D objectness output shape (batch, proposal, 1) - results[f'{prefix}obj_scores'] = cls_preds_trans[..., :1].contiguous() - - # decode semantic score - results[f'{prefix}sem_scores'] = cls_preds_trans[..., 1:].contiguous() - - return results +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import torch + +from mmdet.core.bbox.builder import BBOX_CODERS +from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder + + +@BBOX_CODERS.register_module() +class GroupFree3DBBoxCoder(PartialBinBasedBBoxCoder): + """Modified partial bin based bbox coder for GroupFree3D. + + Args: + num_dir_bins (int): Number of bins to encode direction angle. + num_sizes (int): Number of size clusters. + mean_sizes (list[list[int]]): Mean size of bboxes in each class. + with_rot (bool, optional): Whether the bbox is with rotation. + Defaults to True. + size_cls_agnostic (bool, optional): Whether the predicted size is + class-agnostic. Defaults to True. + """ + + def __init__(self, + num_dir_bins, + num_sizes, + mean_sizes, + with_rot=True, + size_cls_agnostic=True): + super(GroupFree3DBBoxCoder, self).__init__( + num_dir_bins=num_dir_bins, + num_sizes=num_sizes, + mean_sizes=mean_sizes, + with_rot=with_rot) + self.size_cls_agnostic = size_cls_agnostic + + def encode(self, gt_bboxes_3d, gt_labels_3d): + """Encode ground truth to prediction targets. + + Args: + gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes + with shape (n, 7). + gt_labels_3d (torch.Tensor): Ground truth classes. + + Returns: + tuple: Targets of center, size and direction. + """ + # generate center target + center_target = gt_bboxes_3d.gravity_center + + # generate bbox size target + size_target = gt_bboxes_3d.dims + size_class_target = gt_labels_3d + size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor( + self.mean_sizes)[size_class_target] + + # generate dir target + box_num = gt_labels_3d.shape[0] + if self.with_rot: + (dir_class_target, + dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) + else: + dir_class_target = gt_labels_3d.new_zeros(box_num) + dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) + + return (center_target, size_target, size_class_target, size_res_target, + dir_class_target, dir_res_target) + + def decode(self, bbox_out, prefix=''): + """Decode predicted parts to bbox3d. + + Args: + bbox_out (dict): Predictions from model, should contain keys below. + + - center: predicted bottom center of bboxes. + - dir_class: predicted bbox direction class. + - dir_res: predicted bbox direction residual. + - size_class: predicted bbox size class. + - size_res: predicted bbox size residual. + - size: predicted class-agnostic bbox size + prefix (str, optional): Decode predictions with specific prefix. + Defaults to ''. + + Returns: + torch.Tensor: Decoded bbox3d with shape (batch, n, 7). 
+ """ + center = bbox_out[f'{prefix}center'] + batch_size, num_proposal = center.shape[:2] + + # decode heading angle + if self.with_rot: + dir_class = torch.argmax(bbox_out[f'{prefix}dir_class'], -1) + dir_res = torch.gather(bbox_out[f'{prefix}dir_res'], 2, + dir_class.unsqueeze(-1)) + dir_res.squeeze_(2) + dir_angle = self.class2angle(dir_class, dir_res).reshape( + batch_size, num_proposal, 1) + else: + dir_angle = center.new_zeros(batch_size, num_proposal, 1) + + # decode bbox size + if self.size_cls_agnostic: + bbox_size = bbox_out[f'{prefix}size'].reshape( + batch_size, num_proposal, 3) + else: + size_class = torch.argmax( + bbox_out[f'{prefix}size_class'], -1, keepdim=True) + size_res = torch.gather( + bbox_out[f'{prefix}size_res'], 2, + size_class.unsqueeze(-1).repeat(1, 1, 1, 3)) + mean_sizes = center.new_tensor(self.mean_sizes) + size_base = torch.index_select(mean_sizes, 0, + size_class.reshape(-1)) + bbox_size = size_base.reshape(batch_size, num_proposal, + -1) + size_res.squeeze(2) + + bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) + return bbox3d + + def split_pred(self, cls_preds, reg_preds, base_xyz, prefix=''): + """Split predicted features to specific parts. + + Args: + cls_preds (torch.Tensor): Class predicted features to split. + reg_preds (torch.Tensor): Regression predicted features to split. + base_xyz (torch.Tensor): Coordinates of points. + prefix (str, optional): Decode predictions with specific prefix. + Defaults to ''. + + Returns: + dict[str, torch.Tensor]: Split results. + """ + results = {} + start, end = 0, 0 + + cls_preds_trans = cls_preds.transpose(2, 1) + reg_preds_trans = reg_preds.transpose(2, 1) + + # decode center + end += 3 + # (batch_size, num_proposal, 3) + results[f'{prefix}center_residual'] = \ + reg_preds_trans[..., start:end].contiguous() + results[f'{prefix}center'] = base_xyz + \ + reg_preds_trans[..., start:end].contiguous() + start = end + + # decode direction + end += self.num_dir_bins + results[f'{prefix}dir_class'] = \ + reg_preds_trans[..., start:end].contiguous() + start = end + + end += self.num_dir_bins + dir_res_norm = reg_preds_trans[..., start:end].contiguous() + start = end + + results[f'{prefix}dir_res_norm'] = dir_res_norm + results[f'{prefix}dir_res'] = dir_res_norm * ( + np.pi / self.num_dir_bins) + + # decode size + if self.size_cls_agnostic: + end += 3 + results[f'{prefix}size'] = \ + reg_preds_trans[..., start:end].contiguous() + else: + end += self.num_sizes + results[f'{prefix}size_class'] = reg_preds_trans[ + ..., start:end].contiguous() + start = end + + end += self.num_sizes * 3 + size_res_norm = reg_preds_trans[..., start:end] + batch_size, num_proposal = reg_preds_trans.shape[:2] + size_res_norm = size_res_norm.view( + [batch_size, num_proposal, self.num_sizes, 3]) + start = end + + results[f'{prefix}size_res_norm'] = size_res_norm.contiguous() + mean_sizes = reg_preds.new_tensor(self.mean_sizes) + results[f'{prefix}size_res'] = ( + size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0)) + + # decode objectness score + # Group-Free-3D objectness output shape (batch, proposal, 1) + results[f'{prefix}obj_scores'] = cls_preds_trans[..., :1].contiguous() + + # decode semantic score + results[f'{prefix}sem_scores'] = cls_preds_trans[..., 1:].contiguous() + + return results diff --git a/mmdet3d/core/bbox/coders/monoflex_bbox_coder.py b/mmdet3d/core/bbox/coders/monoflex_bbox_coder.py index e2ada29..a23271e 100644 --- a/mmdet3d/core/bbox/coders/monoflex_bbox_coder.py +++ 
b/mmdet3d/core/bbox/coders/monoflex_bbox_coder.py @@ -1,515 +1,515 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from torch.nn import functional as F - -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS - - -@BBOX_CODERS.register_module() -class MonoFlexCoder(BaseBBoxCoder): - """Bbox Coder for MonoFlex. - - Args: - depth_mode (str): The mode for depth calculation. - Available options are "linear", "inv_sigmoid", and "exp". - base_depth (tuple[float]): References for decoding box depth. - depth_range (list): Depth range of predicted depth. - combine_depth (bool): Whether to use combined depth (direct depth - and depth from keypoints) or use direct depth only. - uncertainty_range (list): Uncertainty range of predicted depth. - base_dims (tuple[tuple[float]]): Dimensions mean and std of decode bbox - dimensions [l, h, w] for each category. - dims_mode (str): The mode for dimension calculation. - Available options are "linear" and "exp". - multibin (bool): Whether to use multibin representation. - num_dir_bins (int): Number of Number of bins to encode - direction angle. - bin_centers (list[float]): Local yaw centers while using multibin - representations. - bin_margin (float): Margin of multibin representations. - code_size (int): The dimension of boxes to be encoded. - eps (float, optional): A value added to the denominator for numerical - stability. Default 1e-3. - """ - - def __init__(self, - depth_mode, - base_depth, - depth_range, - combine_depth, - uncertainty_range, - base_dims, - dims_mode, - multibin, - num_dir_bins, - bin_centers, - bin_margin, - code_size, - eps=1e-3): - super(MonoFlexCoder, self).__init__() - - # depth related - self.depth_mode = depth_mode - self.base_depth = base_depth - self.depth_range = depth_range - self.combine_depth = combine_depth - self.uncertainty_range = uncertainty_range - - # dimensions related - self.base_dims = base_dims - self.dims_mode = dims_mode - - # orientation related - self.multibin = multibin - self.num_dir_bins = num_dir_bins - self.bin_centers = bin_centers - self.bin_margin = bin_margin - - # output related - self.bbox_code_size = code_size - self.eps = eps - - def encode(self, gt_bboxes_3d): - """Encode ground truth to prediction targets. - - Args: - gt_bboxes_3d (`BaseInstance3DBoxes`): Ground truth 3D bboxes. - shape: (N, 7). - - Returns: - torch.Tensor: Targets of orientations. - """ - local_yaw = gt_bboxes_3d.local_yaw - # encode local yaw (-pi ~ pi) to multibin format - encode_local_yaw = local_yaw.new_zeros( - [local_yaw.shape[0], self.num_dir_bins * 2]) - bin_size = 2 * np.pi / self.num_dir_bins - margin_size = bin_size * self.bin_margin - - bin_centers = local_yaw.new_tensor(self.bin_centers) - range_size = bin_size / 2 + margin_size - - offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0) - offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi - offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi - - for i in range(self.num_dir_bins): - offset = offsets[:, i] - inds = abs(offset) < range_size - encode_local_yaw[inds, i] = 1 - encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds] - - orientation_target = encode_local_yaw - - return orientation_target - - def decode(self, bbox, base_centers2d, labels, downsample_ratio, cam2imgs): - """Decode bounding box regression into 3D predictions. - - Args: - bbox (Tensor): Raw bounding box predictions for each - predict center2d point. 
- shape: (N, C) - base_centers2d (torch.Tensor): Base centers2d for 3D bboxes. - shape: (N, 2). - labels (Tensor): Batch predict class label for each predict - center2d point. - shape: (N, ) - downsample_ratio (int): The stride of feature map. - cam2imgs (Tensor): Batch images' camera intrinsic matrix. - shape: kitti (N, 4, 4) nuscenes (N, 3, 3) - - Return: - dict: The 3D prediction dict decoded from regression map. - the dict has components below: - - bboxes2d (torch.Tensor): Decoded [x1, y1, x2, y2] format - 2D bboxes. - - dimensions (torch.Tensor): Decoded dimensions for each - object. - - offsets2d (torch.Tenosr): Offsets between base centers2d - and real centers2d. - - direct_depth (torch.Tensor): Decoded directly regressed - depth. - - keypoints2d (torch.Tensor): Keypoints of each projected - 3D box on image. - - keypoints_depth (torch.Tensor): Decoded depth from keypoints. - - combined_depth (torch.Tensor): Combined depth using direct - depth and keypoints depth with depth uncertainty. - - orientations (torch.Tensor): Multibin format orientations - (local yaw) for each objects. - """ - - # 4 dimensions for FCOS style regression - pred_bboxes2d = bbox[:, 0:4] - - # change FCOS style to [x1, y1, x2, y2] format for IOU Loss - pred_bboxes2d = self.decode_bboxes2d(pred_bboxes2d, base_centers2d) - - # 2 dimensions for projected centers2d offsets - pred_offsets2d = bbox[:, 4:6] - - # 3 dimensions for 3D bbox dimensions offsets - pred_dimensions_offsets3d = bbox[:, 29:32] - - # the first 8 dimensions are for orientation bin classification - # and the second 8 dimensions are for orientation offsets. - pred_orientations = torch.cat((bbox[:, 32:40], bbox[:, 40:48]), dim=1) - - # 3 dimensions for the uncertainties of the solved depths from - # groups of keypoints - pred_keypoints_depth_uncertainty = bbox[:, 26:29] - - # 1 dimension for the uncertainty of directly regressed depth - pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1) - - # 2 dimension of offsets x keypoints (8 corners + top/bottom center) - pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2) - - # 1 dimension for depth offsets - pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1) - - # decode the pred residual dimensions to real dimensions - pred_dimensions = self.decode_dims(labels, pred_dimensions_offsets3d) - pred_direct_depth = self.decode_direct_depth(pred_direct_depth_offsets) - pred_keypoints_depth = self.keypoints2depth(pred_keypoints2d, - pred_dimensions, cam2imgs, - downsample_ratio) - - pred_direct_depth_uncertainty = torch.clamp( - pred_direct_depth_uncertainty, self.uncertainty_range[0], - self.uncertainty_range[1]) - pred_keypoints_depth_uncertainty = torch.clamp( - pred_keypoints_depth_uncertainty, self.uncertainty_range[0], - self.uncertainty_range[1]) - - if self.combine_depth: - pred_depth_uncertainty = torch.cat( - (pred_direct_depth_uncertainty.unsqueeze(-1), - pred_keypoints_depth_uncertainty), - dim=1).exp() - pred_depth = torch.cat( - (pred_direct_depth.unsqueeze(-1), pred_keypoints_depth), dim=1) - pred_combined_depth = \ - self.combine_depths(pred_depth, pred_depth_uncertainty) - else: - pred_combined_depth = None - - preds = dict( - bboxes2d=pred_bboxes2d, - dimensions=pred_dimensions, - offsets2d=pred_offsets2d, - keypoints2d=pred_keypoints2d, - orientations=pred_orientations, - direct_depth=pred_direct_depth, - keypoints_depth=pred_keypoints_depth, - combined_depth=pred_combined_depth, - direct_depth_uncertainty=pred_direct_depth_uncertainty, - 
keypoints_depth_uncertainty=pred_keypoints_depth_uncertainty, - ) - - return preds - - def decode_direct_depth(self, depth_offsets): - """Transform depth offset to directly regressed depth. - - Args: - depth_offsets (torch.Tensor): Predicted depth offsets. - shape: (N, ) - - Return: - torch.Tensor: Directly regressed depth. - shape: (N, ) - """ - if self.depth_mode == 'exp': - direct_depth = depth_offsets.exp() - elif self.depth_mode == 'linear': - base_depth = depth_offsets.new_tensor(self.base_depth) - direct_depth = depth_offsets * base_depth[1] + base_depth[0] - elif self.depth_mode == 'inv_sigmoid': - direct_depth = 1 / torch.sigmoid(depth_offsets) - 1 - else: - raise ValueError - - if self.depth_range is not None: - direct_depth = torch.clamp( - direct_depth, min=self.depth_range[0], max=self.depth_range[1]) - - return direct_depth - - def decode_location(self, - base_centers2d, - offsets2d, - depths, - cam2imgs, - downsample_ratio, - pad_mode='default'): - """Retrieve object location. - - Args: - base_centers2d (torch.Tensor): predicted base centers2d. - shape: (N, 2) - offsets2d (torch.Tensor): The offsets between real centers2d - and base centers2d. - shape: (N , 2) - depths (torch.Tensor): Depths of objects. - shape: (N, ) - cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix. - shape: kitti (N, 4, 4) nuscenes (N, 3, 3) - downsample_ratio (int): The stride of feature map. - pad_mode (str, optional): Padding mode used in - training data augmentation. - - Return: - tuple(torch.Tensor): Centers of 3D boxes. - shape: (N, 3) - """ - N = cam2imgs.shape[0] - # (N, 4, 4) - cam2imgs_inv = cam2imgs.inverse() - if pad_mode == 'default': - centers2d_img = (base_centers2d + offsets2d) * downsample_ratio - else: - raise NotImplementedError - # (N, 3) - centers2d_img = \ - torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1) - # (N, 4, 1) - centers2d_extend = \ - torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)), - dim=1).unsqueeze(-1) - locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1) - - return locations[:, :3] - - def keypoints2depth(self, - keypoints2d, - dimensions, - cam2imgs, - downsample_ratio=4, - group0_index=[(7, 3), (0, 4)], - group1_index=[(2, 6), (1, 5)]): - """Decode depth form three groups of keypoints and geometry projection - model. 2D keypoints inlucding 8 coreners and top/bottom centers will be - divided into three groups which will be used to calculate three depths - of object. - - .. code-block:: none - - Group center keypoints: - - + --------------- + - /| top center /| - / | . / | - / | | / | - + ---------|----- + + - | / | | / - | / . | / - |/ bottom center |/ - + --------------- + - - Group 0 keypoints: - - 0 - + -------------- + - /| /| - / | / | - / | 5/ | - + -------------- + + - | /3 | / - | / | / - |/ |/ - + -------------- + 6 - - Group 1 keypoints: - - 4 - + -------------- + - /| /| - / | / | - / | / | - 1 + -------------- + + 7 - | / | / - | / | / - |/ |/ - 2 + -------------- + - - - Args: - keypoints2d (torch.Tensor): Keypoints of objects. - 8 vertices + top/bottom center. - shape: (N, 10, 2) - dimensions (torch.Tensor): Dimensions of objetcts. - shape: (N, 3) - cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix. - shape: kitti (N, 4, 4) nuscenes (N, 3, 3) - downsample_ratio (int, opitonal): The stride of feature map. - Defaults: 4. - group0_index(list[tuple[int]], optional): Keypoints group 0 - of index to calculate the depth. - Defaults: [0, 3, 4, 7]. 
- group1_index(list[tuple[int]], optional): Keypoints group 1 - of index to calculate the depth. - Defaults: [1, 2, 5, 6] - - Return: - tuple(torch.Tensor): Depth computed from three groups of - keypoints (top/bottom, group0, group1) - shape: (N, 3) - """ - - pred_height_3d = dimensions[:, 1].clone() - f_u = cam2imgs[:, 0, 0] - center_height = keypoints2d[:, -2, 1] - keypoints2d[:, -1, 1] - corner_group0_height = keypoints2d[:, group0_index[0], 1] \ - - keypoints2d[:, group0_index[1], 1] - corner_group1_height = keypoints2d[:, group1_index[0], 1] \ - - keypoints2d[:, group1_index[1], 1] - center_depth = f_u * pred_height_3d / ( - F.relu(center_height) * downsample_ratio + self.eps) - corner_group0_depth = (f_u * pred_height_3d).unsqueeze(-1) / ( - F.relu(corner_group0_height) * downsample_ratio + self.eps) - corner_group1_depth = (f_u * pred_height_3d).unsqueeze(-1) / ( - F.relu(corner_group1_height) * downsample_ratio + self.eps) - - corner_group0_depth = corner_group0_depth.mean(dim=1) - corner_group1_depth = corner_group1_depth.mean(dim=1) - - keypoints_depth = torch.stack( - (center_depth, corner_group0_depth, corner_group1_depth), dim=1) - keypoints_depth = torch.clamp( - keypoints_depth, min=self.depth_range[0], max=self.depth_range[1]) - - return keypoints_depth - - def decode_dims(self, labels, dims_offset): - """Retrieve object dimensions. - - Args: - labels (torch.Tensor): Each points' category id. - shape: (N, K) - dims_offset (torch.Tensor): Dimension offsets. - shape: (N, 3) - - Returns: - torch.Tensor: Shape (N, 3) - """ - - if self.dims_mode == 'exp': - dims_offset = dims_offset.exp() - elif self.dims_mode == 'linear': - labels = labels.long() - base_dims = dims_offset.new_tensor(self.base_dims) - dims_mean = base_dims[:, :3] - dims_std = base_dims[:, 3:6] - cls_dimension_mean = dims_mean[labels, :] - cls_dimension_std = dims_std[labels, :] - dimensions = dims_offset * cls_dimension_mean + cls_dimension_std - else: - raise ValueError - - return dimensions - - def decode_orientation(self, ori_vector, locations): - """Retrieve object orientation. - - Args: - ori_vector (torch.Tensor): Local orientation vector - in [axis_cls, head_cls, sin, cos] format. - shape: (N, num_dir_bins * 4) - locations (torch.Tensor): Object location. - shape: (N, 3) - - Returns: - tuple[torch.Tensor]: yaws and local yaws of 3d bboxes. 
- """ - if self.multibin: - pred_bin_cls = ori_vector[:, :self.num_dir_bins * 2].view( - -1, self.num_dir_bins, 2) - pred_bin_cls = pred_bin_cls.softmax(dim=2)[..., 1] - orientations = ori_vector.new_zeros(ori_vector.shape[0]) - for i in range(self.num_dir_bins): - mask_i = (pred_bin_cls.argmax(dim=1) == i) - start_bin = self.num_dir_bins * 2 + i * 2 - end_bin = start_bin + 2 - pred_bin_offset = ori_vector[mask_i, start_bin:end_bin] - orientations[mask_i] = pred_bin_offset[:, 0].atan2( - pred_bin_offset[:, 1]) + self.bin_centers[i] - else: - axis_cls = ori_vector[:, :2].softmax(dim=1) - axis_cls = axis_cls[:, 0] < axis_cls[:, 1] - head_cls = ori_vector[:, 2:4].softmax(dim=1) - head_cls = head_cls[:, 0] < head_cls[:, 1] - # cls axis - orientations = self.bin_centers[axis_cls + head_cls * 2] - sin_cos_offset = F.normalize(ori_vector[:, 4:]) - orientations += sin_cos_offset[:, 0].atan(sin_cos_offset[:, 1]) - - locations = locations.view(-1, 3) - rays = locations[:, 0].atan2(locations[:, 2]) - local_yaws = orientations - yaws = local_yaws + rays - - larger_idx = (yaws > np.pi).nonzero(as_tuple=False) - small_idx = (yaws < -np.pi).nonzero(as_tuple=False) - if len(larger_idx) != 0: - yaws[larger_idx] -= 2 * np.pi - if len(small_idx) != 0: - yaws[small_idx] += 2 * np.pi - - larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False) - small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False) - if len(larger_idx) != 0: - local_yaws[larger_idx] -= 2 * np.pi - if len(small_idx) != 0: - local_yaws[small_idx] += 2 * np.pi - - return yaws, local_yaws - - def decode_bboxes2d(self, reg_bboxes2d, base_centers2d): - """Retrieve [x1, y1, x2, y2] format 2D bboxes. - - Args: - reg_bboxes2d (torch.Tensor): Predicted FCOS style - 2D bboxes. - shape: (N, 4) - base_centers2d (torch.Tensor): predicted base centers2d. - shape: (N, 2) - - Returns: - torch.Tenosr: [x1, y1, x2, y2] format 2D bboxes. - """ - centers_x = base_centers2d[:, 0] - centers_y = base_centers2d[:, 1] - - xs_min = centers_x - reg_bboxes2d[..., 0] - ys_min = centers_y - reg_bboxes2d[..., 1] - xs_max = centers_x + reg_bboxes2d[..., 2] - ys_max = centers_y + reg_bboxes2d[..., 3] - - bboxes2d = torch.stack([xs_min, ys_min, xs_max, ys_max], dim=-1) - - return bboxes2d - - def combine_depths(self, depth, depth_uncertainty): - """Combine all the prediced depths with depth uncertainty. - - Args: - depth (torch.Tensor): Predicted depths of each object. - 2D bboxes. - shape: (N, 4) - depth_uncertainty (torch.Tensor): Depth uncertainty for - each depth of each object. - shape: (N, 4) - - Returns: - torch.Tenosr: combined depth. - """ - uncertainty_weights = 1 / depth_uncertainty - uncertainty_weights = \ - uncertainty_weights / \ - uncertainty_weights.sum(dim=1, keepdim=True) - combined_depth = torch.sum(depth * uncertainty_weights, dim=1) - - return combined_depth +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from torch.nn import functional as F + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class MonoFlexCoder(BaseBBoxCoder): + """Bbox Coder for MonoFlex. + + Args: + depth_mode (str): The mode for depth calculation. + Available options are "linear", "inv_sigmoid", and "exp". + base_depth (tuple[float]): References for decoding box depth. + depth_range (list): Depth range of predicted depth. + combine_depth (bool): Whether to use combined depth (direct depth + and depth from keypoints) or use direct depth only. 
+ uncertainty_range (list): Uncertainty range of predicted depth. + base_dims (tuple[tuple[float]]): Dimensions mean and std of decode bbox + dimensions [l, h, w] for each category. + dims_mode (str): The mode for dimension calculation. + Available options are "linear" and "exp". + multibin (bool): Whether to use multibin representation. + num_dir_bins (int): Number of Number of bins to encode + direction angle. + bin_centers (list[float]): Local yaw centers while using multibin + representations. + bin_margin (float): Margin of multibin representations. + code_size (int): The dimension of boxes to be encoded. + eps (float, optional): A value added to the denominator for numerical + stability. Default 1e-3. + """ + + def __init__(self, + depth_mode, + base_depth, + depth_range, + combine_depth, + uncertainty_range, + base_dims, + dims_mode, + multibin, + num_dir_bins, + bin_centers, + bin_margin, + code_size, + eps=1e-3): + super(MonoFlexCoder, self).__init__() + + # depth related + self.depth_mode = depth_mode + self.base_depth = base_depth + self.depth_range = depth_range + self.combine_depth = combine_depth + self.uncertainty_range = uncertainty_range + + # dimensions related + self.base_dims = base_dims + self.dims_mode = dims_mode + + # orientation related + self.multibin = multibin + self.num_dir_bins = num_dir_bins + self.bin_centers = bin_centers + self.bin_margin = bin_margin + + # output related + self.bbox_code_size = code_size + self.eps = eps + + def encode(self, gt_bboxes_3d): + """Encode ground truth to prediction targets. + + Args: + gt_bboxes_3d (`BaseInstance3DBoxes`): Ground truth 3D bboxes. + shape: (N, 7). + + Returns: + torch.Tensor: Targets of orientations. + """ + local_yaw = gt_bboxes_3d.local_yaw + # encode local yaw (-pi ~ pi) to multibin format + encode_local_yaw = local_yaw.new_zeros( + [local_yaw.shape[0], self.num_dir_bins * 2]) + bin_size = 2 * np.pi / self.num_dir_bins + margin_size = bin_size * self.bin_margin + + bin_centers = local_yaw.new_tensor(self.bin_centers) + range_size = bin_size / 2 + margin_size + + offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0) + offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi + offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi + + for i in range(self.num_dir_bins): + offset = offsets[:, i] + inds = abs(offset) < range_size + encode_local_yaw[inds, i] = 1 + encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds] + + orientation_target = encode_local_yaw + + return orientation_target + + def decode(self, bbox, base_centers2d, labels, downsample_ratio, cam2imgs): + """Decode bounding box regression into 3D predictions. + + Args: + bbox (Tensor): Raw bounding box predictions for each + predict center2d point. + shape: (N, C) + base_centers2d (torch.Tensor): Base centers2d for 3D bboxes. + shape: (N, 2). + labels (Tensor): Batch predict class label for each predict + center2d point. + shape: (N, ) + downsample_ratio (int): The stride of feature map. + cam2imgs (Tensor): Batch images' camera intrinsic matrix. + shape: kitti (N, 4, 4) nuscenes (N, 3, 3) + + Return: + dict: The 3D prediction dict decoded from regression map. + the dict has components below: + - bboxes2d (torch.Tensor): Decoded [x1, y1, x2, y2] format + 2D bboxes. + - dimensions (torch.Tensor): Decoded dimensions for each + object. + - offsets2d (torch.Tenosr): Offsets between base centers2d + and real centers2d. + - direct_depth (torch.Tensor): Decoded directly regressed + depth. 
+ - keypoints2d (torch.Tensor): Keypoints of each projected + 3D box on image. + - keypoints_depth (torch.Tensor): Decoded depth from keypoints. + - combined_depth (torch.Tensor): Combined depth using direct + depth and keypoints depth with depth uncertainty. + - orientations (torch.Tensor): Multibin format orientations + (local yaw) for each objects. + """ + + # 4 dimensions for FCOS style regression + pred_bboxes2d = bbox[:, 0:4] + + # change FCOS style to [x1, y1, x2, y2] format for IOU Loss + pred_bboxes2d = self.decode_bboxes2d(pred_bboxes2d, base_centers2d) + + # 2 dimensions for projected centers2d offsets + pred_offsets2d = bbox[:, 4:6] + + # 3 dimensions for 3D bbox dimensions offsets + pred_dimensions_offsets3d = bbox[:, 29:32] + + # the first 8 dimensions are for orientation bin classification + # and the second 8 dimensions are for orientation offsets. + pred_orientations = torch.cat((bbox[:, 32:40], bbox[:, 40:48]), dim=1) + + # 3 dimensions for the uncertainties of the solved depths from + # groups of keypoints + pred_keypoints_depth_uncertainty = bbox[:, 26:29] + + # 1 dimension for the uncertainty of directly regressed depth + pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1) + + # 2 dimension of offsets x keypoints (8 corners + top/bottom center) + pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2) + + # 1 dimension for depth offsets + pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1) + + # decode the pred residual dimensions to real dimensions + pred_dimensions = self.decode_dims(labels, pred_dimensions_offsets3d) + pred_direct_depth = self.decode_direct_depth(pred_direct_depth_offsets) + pred_keypoints_depth = self.keypoints2depth(pred_keypoints2d, + pred_dimensions, cam2imgs, + downsample_ratio) + + pred_direct_depth_uncertainty = torch.clamp( + pred_direct_depth_uncertainty, self.uncertainty_range[0], + self.uncertainty_range[1]) + pred_keypoints_depth_uncertainty = torch.clamp( + pred_keypoints_depth_uncertainty, self.uncertainty_range[0], + self.uncertainty_range[1]) + + if self.combine_depth: + pred_depth_uncertainty = torch.cat( + (pred_direct_depth_uncertainty.unsqueeze(-1), + pred_keypoints_depth_uncertainty), + dim=1).exp() + pred_depth = torch.cat( + (pred_direct_depth.unsqueeze(-1), pred_keypoints_depth), dim=1) + pred_combined_depth = \ + self.combine_depths(pred_depth, pred_depth_uncertainty) + else: + pred_combined_depth = None + + preds = dict( + bboxes2d=pred_bboxes2d, + dimensions=pred_dimensions, + offsets2d=pred_offsets2d, + keypoints2d=pred_keypoints2d, + orientations=pred_orientations, + direct_depth=pred_direct_depth, + keypoints_depth=pred_keypoints_depth, + combined_depth=pred_combined_depth, + direct_depth_uncertainty=pred_direct_depth_uncertainty, + keypoints_depth_uncertainty=pred_keypoints_depth_uncertainty, + ) + + return preds + + def decode_direct_depth(self, depth_offsets): + """Transform depth offset to directly regressed depth. + + Args: + depth_offsets (torch.Tensor): Predicted depth offsets. + shape: (N, ) + + Return: + torch.Tensor: Directly regressed depth. 
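The decode() above consumes one flat regression vector per predicted center and carves it into fixed channel groups. As a reading aid, here is a minimal standalone sketch (random tensor, channel names taken from the comments above; the 4-bin orientation split is only inferred from the 8 + 8 slice and is an assumption, not a config value from this patch):

# Channel layout consumed by MonoFlexCoder.decode, assuming C = 50 and
# num_dir_bins = 4 (so the orientation block spans 16 channels).
import torch

N = 2                      # number of predicted centers (hypothetical)
bbox = torch.randn(N, 50)  # raw regression output, shape (N, C)

channels = {
    'bboxes2d (FCOS l, t, r, b)': bbox[:, 0:4],
    'offsets2d': bbox[:, 4:6],
    'keypoints2d (10 x 2)': bbox[:, 6:26].reshape(-1, 10, 2),
    'keypoints_depth_uncertainty': bbox[:, 26:29],
    'dimensions_offsets3d': bbox[:, 29:32],
    'orientation_bin_cls': bbox[:, 32:40],
    'orientation_bin_offset': bbox[:, 40:48],
    'direct_depth_offset': bbox[:, 48:49],
    'direct_depth_uncertainty': bbox[:, 49:50],
}
for name, value in channels.items():
    print(name, tuple(value.shape))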
+ shape: (N, ) + """ + if self.depth_mode == 'exp': + direct_depth = depth_offsets.exp() + elif self.depth_mode == 'linear': + base_depth = depth_offsets.new_tensor(self.base_depth) + direct_depth = depth_offsets * base_depth[1] + base_depth[0] + elif self.depth_mode == 'inv_sigmoid': + direct_depth = 1 / torch.sigmoid(depth_offsets) - 1 + else: + raise ValueError + + if self.depth_range is not None: + direct_depth = torch.clamp( + direct_depth, min=self.depth_range[0], max=self.depth_range[1]) + + return direct_depth + + def decode_location(self, + base_centers2d, + offsets2d, + depths, + cam2imgs, + downsample_ratio, + pad_mode='default'): + """Retrieve object location. + + Args: + base_centers2d (torch.Tensor): predicted base centers2d. + shape: (N, 2) + offsets2d (torch.Tensor): The offsets between real centers2d + and base centers2d. + shape: (N , 2) + depths (torch.Tensor): Depths of objects. + shape: (N, ) + cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix. + shape: kitti (N, 4, 4) nuscenes (N, 3, 3) + downsample_ratio (int): The stride of feature map. + pad_mode (str, optional): Padding mode used in + training data augmentation. + + Return: + tuple(torch.Tensor): Centers of 3D boxes. + shape: (N, 3) + """ + N = cam2imgs.shape[0] + # (N, 4, 4) + cam2imgs_inv = cam2imgs.inverse() + if pad_mode == 'default': + centers2d_img = (base_centers2d + offsets2d) * downsample_ratio + else: + raise NotImplementedError + # (N, 3) + centers2d_img = \ + torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1) + # (N, 4, 1) + centers2d_extend = \ + torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)), + dim=1).unsqueeze(-1) + locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1) + + return locations[:, :3] + + def keypoints2depth(self, + keypoints2d, + dimensions, + cam2imgs, + downsample_ratio=4, + group0_index=[(7, 3), (0, 4)], + group1_index=[(2, 6), (1, 5)]): + """Decode depth form three groups of keypoints and geometry projection + model. 2D keypoints inlucding 8 coreners and top/bottom centers will be + divided into three groups which will be used to calculate three depths + of object. + + .. code-block:: none + + Group center keypoints: + + + --------------- + + /| top center /| + / | . / | + / | | / | + + ---------|----- + + + | / | | / + | / . | / + |/ bottom center |/ + + --------------- + + + Group 0 keypoints: + + 0 + + -------------- + + /| /| + / | / | + / | 5/ | + + -------------- + + + | /3 | / + | / | / + |/ |/ + + -------------- + 6 + + Group 1 keypoints: + + 4 + + -------------- + + /| /| + / | / | + / | / | + 1 + -------------- + + 7 + | / | / + | / | / + |/ |/ + 2 + -------------- + + + + Args: + keypoints2d (torch.Tensor): Keypoints of objects. + 8 vertices + top/bottom center. + shape: (N, 10, 2) + dimensions (torch.Tensor): Dimensions of objetcts. + shape: (N, 3) + cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix. + shape: kitti (N, 4, 4) nuscenes (N, 3, 3) + downsample_ratio (int, opitonal): The stride of feature map. + Defaults: 4. + group0_index(list[tuple[int]], optional): Keypoints group 0 + of index to calculate the depth. + Defaults: [0, 3, 4, 7]. + group1_index(list[tuple[int]], optional): Keypoints group 1 + of index to calculate the depth. 
+ Defaults: [1, 2, 5, 6] + + Return: + tuple(torch.Tensor): Depth computed from three groups of + keypoints (top/bottom, group0, group1) + shape: (N, 3) + """ + + pred_height_3d = dimensions[:, 1].clone() + f_u = cam2imgs[:, 0, 0] + center_height = keypoints2d[:, -2, 1] - keypoints2d[:, -1, 1] + corner_group0_height = keypoints2d[:, group0_index[0], 1] \ + - keypoints2d[:, group0_index[1], 1] + corner_group1_height = keypoints2d[:, group1_index[0], 1] \ + - keypoints2d[:, group1_index[1], 1] + center_depth = f_u * pred_height_3d / ( + F.relu(center_height) * downsample_ratio + self.eps) + corner_group0_depth = (f_u * pred_height_3d).unsqueeze(-1) / ( + F.relu(corner_group0_height) * downsample_ratio + self.eps) + corner_group1_depth = (f_u * pred_height_3d).unsqueeze(-1) / ( + F.relu(corner_group1_height) * downsample_ratio + self.eps) + + corner_group0_depth = corner_group0_depth.mean(dim=1) + corner_group1_depth = corner_group1_depth.mean(dim=1) + + keypoints_depth = torch.stack( + (center_depth, corner_group0_depth, corner_group1_depth), dim=1) + keypoints_depth = torch.clamp( + keypoints_depth, min=self.depth_range[0], max=self.depth_range[1]) + + return keypoints_depth + + def decode_dims(self, labels, dims_offset): + """Retrieve object dimensions. + + Args: + labels (torch.Tensor): Each points' category id. + shape: (N, K) + dims_offset (torch.Tensor): Dimension offsets. + shape: (N, 3) + + Returns: + torch.Tensor: Shape (N, 3) + """ + + if self.dims_mode == 'exp': + dims_offset = dims_offset.exp() + elif self.dims_mode == 'linear': + labels = labels.long() + base_dims = dims_offset.new_tensor(self.base_dims) + dims_mean = base_dims[:, :3] + dims_std = base_dims[:, 3:6] + cls_dimension_mean = dims_mean[labels, :] + cls_dimension_std = dims_std[labels, :] + dimensions = dims_offset * cls_dimension_mean + cls_dimension_std + else: + raise ValueError + + return dimensions + + def decode_orientation(self, ori_vector, locations): + """Retrieve object orientation. + + Args: + ori_vector (torch.Tensor): Local orientation vector + in [axis_cls, head_cls, sin, cos] format. + shape: (N, num_dir_bins * 4) + locations (torch.Tensor): Object location. + shape: (N, 3) + + Returns: + tuple[torch.Tensor]: yaws and local yaws of 3d bboxes. 
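The keypoint-based depth above is plain pinhole geometry: an object of metric height h projects to a pixel height of f_u * h / z, so z = f_u * h / pixel_height once the feature-map height is rescaled by downsample_ratio. A small numeric check with assumed values (focal length, heights and stride are made up, not taken from this patch):

# Pinhole-geometry check mirroring keypoints2depth for a single object.
f_u = 720.0                 # assumed focal length in pixels
height_3d = 1.5             # assumed 3D box height in meters
pixel_height_on_feat = 30.0 # keypoint height difference on the feature map
downsample_ratio = 4
eps = 1e-3

pixel_height_on_img = pixel_height_on_feat * downsample_ratio  # 120 px
depth = f_u * height_3d / (pixel_height_on_img + eps)
print(round(depth, 2))  # ~9.0 m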
+ """ + if self.multibin: + pred_bin_cls = ori_vector[:, :self.num_dir_bins * 2].view( + -1, self.num_dir_bins, 2) + pred_bin_cls = pred_bin_cls.softmax(dim=2)[..., 1] + orientations = ori_vector.new_zeros(ori_vector.shape[0]) + for i in range(self.num_dir_bins): + mask_i = (pred_bin_cls.argmax(dim=1) == i) + start_bin = self.num_dir_bins * 2 + i * 2 + end_bin = start_bin + 2 + pred_bin_offset = ori_vector[mask_i, start_bin:end_bin] + orientations[mask_i] = pred_bin_offset[:, 0].atan2( + pred_bin_offset[:, 1]) + self.bin_centers[i] + else: + axis_cls = ori_vector[:, :2].softmax(dim=1) + axis_cls = axis_cls[:, 0] < axis_cls[:, 1] + head_cls = ori_vector[:, 2:4].softmax(dim=1) + head_cls = head_cls[:, 0] < head_cls[:, 1] + # cls axis + orientations = self.bin_centers[axis_cls + head_cls * 2] + sin_cos_offset = F.normalize(ori_vector[:, 4:]) + orientations += sin_cos_offset[:, 0].atan(sin_cos_offset[:, 1]) + + locations = locations.view(-1, 3) + rays = locations[:, 0].atan2(locations[:, 2]) + local_yaws = orientations + yaws = local_yaws + rays + + larger_idx = (yaws > np.pi).nonzero(as_tuple=False) + small_idx = (yaws < -np.pi).nonzero(as_tuple=False) + if len(larger_idx) != 0: + yaws[larger_idx] -= 2 * np.pi + if len(small_idx) != 0: + yaws[small_idx] += 2 * np.pi + + larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False) + small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False) + if len(larger_idx) != 0: + local_yaws[larger_idx] -= 2 * np.pi + if len(small_idx) != 0: + local_yaws[small_idx] += 2 * np.pi + + return yaws, local_yaws + + def decode_bboxes2d(self, reg_bboxes2d, base_centers2d): + """Retrieve [x1, y1, x2, y2] format 2D bboxes. + + Args: + reg_bboxes2d (torch.Tensor): Predicted FCOS style + 2D bboxes. + shape: (N, 4) + base_centers2d (torch.Tensor): predicted base centers2d. + shape: (N, 2) + + Returns: + torch.Tenosr: [x1, y1, x2, y2] format 2D bboxes. + """ + centers_x = base_centers2d[:, 0] + centers_y = base_centers2d[:, 1] + + xs_min = centers_x - reg_bboxes2d[..., 0] + ys_min = centers_y - reg_bboxes2d[..., 1] + xs_max = centers_x + reg_bboxes2d[..., 2] + ys_max = centers_y + reg_bboxes2d[..., 3] + + bboxes2d = torch.stack([xs_min, ys_min, xs_max, ys_max], dim=-1) + + return bboxes2d + + def combine_depths(self, depth, depth_uncertainty): + """Combine all the prediced depths with depth uncertainty. + + Args: + depth (torch.Tensor): Predicted depths of each object. + 2D bboxes. + shape: (N, 4) + depth_uncertainty (torch.Tensor): Depth uncertainty for + each depth of each object. + shape: (N, 4) + + Returns: + torch.Tenosr: combined depth. + """ + uncertainty_weights = 1 / depth_uncertainty + uncertainty_weights = \ + uncertainty_weights / \ + uncertainty_weights.sum(dim=1, keepdim=True) + combined_depth = torch.sum(depth * uncertainty_weights, dim=1) + + return combined_depth diff --git a/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py b/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py index ed8020d..d606954 100644 --- a/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py @@ -1,241 +1,241 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS - - -@BBOX_CODERS.register_module() -class PartialBinBasedBBoxCoder(BaseBBoxCoder): - """Partial bin based bbox coder. - - Args: - num_dir_bins (int): Number of bins to encode direction angle. 
- num_sizes (int): Number of size clusters. - mean_sizes (list[list[int]]): Mean size of bboxes in each class. - with_rot (bool): Whether the bbox is with rotation. - """ - - def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True): - super(PartialBinBasedBBoxCoder, self).__init__() - assert len(mean_sizes) == num_sizes - self.num_dir_bins = num_dir_bins - self.num_sizes = num_sizes - self.mean_sizes = mean_sizes - self.with_rot = with_rot - - def encode(self, gt_bboxes_3d, gt_labels_3d): - """Encode ground truth to prediction targets. - - Args: - gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes - with shape (n, 7). - gt_labels_3d (torch.Tensor): Ground truth classes. - - Returns: - tuple: Targets of center, size and direction. - """ - # generate center target - center_target = gt_bboxes_3d.gravity_center - - # generate bbox size target - size_class_target = gt_labels_3d - size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor( - self.mean_sizes)[size_class_target] - - # generate dir target - box_num = gt_labels_3d.shape[0] - if self.with_rot: - (dir_class_target, - dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) - else: - dir_class_target = gt_labels_3d.new_zeros(box_num) - dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) - - return (center_target, size_class_target, size_res_target, - dir_class_target, dir_res_target) - - def decode(self, bbox_out, suffix=''): - """Decode predicted parts to bbox3d. - - Args: - bbox_out (dict): Predictions from model, should contain keys below. - - - center: predicted bottom center of bboxes. - - dir_class: predicted bbox direction class. - - dir_res: predicted bbox direction residual. - - size_class: predicted bbox size class. - - size_res: predicted bbox size residual. - suffix (str): Decode predictions with specific suffix. - - Returns: - torch.Tensor: Decoded bbox3d with shape (batch, n, 7). - """ - center = bbox_out['center' + suffix] - batch_size, num_proposal = center.shape[:2] - - # decode heading angle - if self.with_rot: - dir_class = torch.argmax(bbox_out['dir_class' + suffix], -1) - dir_res = torch.gather(bbox_out['dir_res' + suffix], 2, - dir_class.unsqueeze(-1)) - dir_res.squeeze_(2) - dir_angle = self.class2angle(dir_class, dir_res).reshape( - batch_size, num_proposal, 1) - else: - dir_angle = center.new_zeros(batch_size, num_proposal, 1) - - # decode bbox size - size_class = torch.argmax( - bbox_out['size_class' + suffix], -1, keepdim=True) - size_res = torch.gather(bbox_out['size_res' + suffix], 2, - size_class.unsqueeze(-1).repeat(1, 1, 1, 3)) - mean_sizes = center.new_tensor(self.mean_sizes) - size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1)) - bbox_size = size_base.reshape(batch_size, num_proposal, - -1) + size_res.squeeze(2) - - bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) - return bbox3d - - def decode_corners(self, center, size_res, size_class): - """Decode center, size residuals and class to corners. Only useful for - axis-aligned bounding boxes, so angle isn't considered. 
- - Args: - center (torch.Tensor): Shape [B, N, 3] - size_res (torch.Tensor): Shape [B, N, 3] or [B, N, C, 3] - size_class (torch.Tensor): Shape: [B, N] or [B, N, 1] - or [B, N, C, 3] - - Returns: - torch.Tensor: Corners with shape [B, N, 6] - """ - if len(size_class.shape) == 2 or size_class.shape[-1] == 1: - batch_size, proposal_num = size_class.shape[:2] - one_hot_size_class = size_res.new_zeros( - (batch_size, proposal_num, self.num_sizes)) - if len(size_class.shape) == 2: - size_class = size_class.unsqueeze(-1) - one_hot_size_class.scatter_(2, size_class, 1) - one_hot_size_class_expand = one_hot_size_class.unsqueeze( - -1).repeat(1, 1, 1, 3).contiguous() - else: - one_hot_size_class_expand = size_class - - if len(size_res.shape) == 4: - size_res = torch.sum(size_res * one_hot_size_class_expand, 2) - - mean_sizes = size_res.new_tensor(self.mean_sizes) - mean_sizes = torch.sum(mean_sizes * one_hot_size_class_expand, 2) - size_full = (size_res + 1) * mean_sizes - size_full = torch.clamp(size_full, 0) - half_size_full = size_full / 2 - corner1 = center - half_size_full - corner2 = center + half_size_full - corners = torch.cat([corner1, corner2], dim=-1) - return corners - - def split_pred(self, cls_preds, reg_preds, base_xyz): - """Split predicted features to specific parts. - - Args: - cls_preds (torch.Tensor): Class predicted features to split. - reg_preds (torch.Tensor): Regression predicted features to split. - base_xyz (torch.Tensor): Coordinates of points. - - Returns: - dict[str, torch.Tensor]: Split results. - """ - results = {} - start, end = 0, 0 - - cls_preds_trans = cls_preds.transpose(2, 1) - reg_preds_trans = reg_preds.transpose(2, 1) - - # decode center - end += 3 - # (batch_size, num_proposal, 3) - results['center'] = base_xyz + \ - reg_preds_trans[..., start:end].contiguous() - start = end - - # decode direction - end += self.num_dir_bins - results['dir_class'] = reg_preds_trans[..., start:end].contiguous() - start = end - - end += self.num_dir_bins - dir_res_norm = reg_preds_trans[..., start:end].contiguous() - start = end - - results['dir_res_norm'] = dir_res_norm - results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins) - - # decode size - end += self.num_sizes - results['size_class'] = reg_preds_trans[..., start:end].contiguous() - start = end - - end += self.num_sizes * 3 - size_res_norm = reg_preds_trans[..., start:end] - batch_size, num_proposal = reg_preds_trans.shape[:2] - size_res_norm = size_res_norm.view( - [batch_size, num_proposal, self.num_sizes, 3]) - start = end - - results['size_res_norm'] = size_res_norm.contiguous() - mean_sizes = reg_preds.new_tensor(self.mean_sizes) - results['size_res'] = ( - size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0)) - - # decode objectness score - start = 0 - end = 2 - results['obj_scores'] = cls_preds_trans[..., start:end].contiguous() - start = end - - # decode semantic score - results['sem_scores'] = cls_preds_trans[..., start:].contiguous() - - return results - - def angle2class(self, angle): - """Convert continuous angle to a discrete class and a residual. - - Convert continuous angle to a discrete class and a small - regression number from class center angle to current angle. - - Args: - angle (torch.Tensor): Angle is from 0-2pi (or -pi~pi), - class center at 0, 1*(2pi/N), 2*(2pi/N) ... (N-1)*(2pi/N). - - Returns: - tuple: Encoded discrete class and residual. 
- """ - angle = angle % (2 * np.pi) - angle_per_class = 2 * np.pi / float(self.num_dir_bins) - shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) - angle_cls = shifted_angle // angle_per_class - angle_res = shifted_angle - ( - angle_cls * angle_per_class + angle_per_class / 2) - return angle_cls.long(), angle_res - - def class2angle(self, angle_cls, angle_res, limit_period=True): - """Inverse function to angle2class. - - Args: - angle_cls (torch.Tensor): Angle class to decode. - angle_res (torch.Tensor): Angle residual to decode. - limit_period (bool): Whether to limit angle to [-pi, pi]. - - Returns: - torch.Tensor: Angle decoded from angle_cls and angle_res. - """ - angle_per_class = 2 * np.pi / float(self.num_dir_bins) - angle_center = angle_cls.float() * angle_per_class - angle = angle_center + angle_res - if limit_period: - angle[angle > np.pi] -= 2 * np.pi - return angle +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class PartialBinBasedBBoxCoder(BaseBBoxCoder): + """Partial bin based bbox coder. + + Args: + num_dir_bins (int): Number of bins to encode direction angle. + num_sizes (int): Number of size clusters. + mean_sizes (list[list[int]]): Mean size of bboxes in each class. + with_rot (bool): Whether the bbox is with rotation. + """ + + def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True): + super(PartialBinBasedBBoxCoder, self).__init__() + assert len(mean_sizes) == num_sizes + self.num_dir_bins = num_dir_bins + self.num_sizes = num_sizes + self.mean_sizes = mean_sizes + self.with_rot = with_rot + + def encode(self, gt_bboxes_3d, gt_labels_3d): + """Encode ground truth to prediction targets. + + Args: + gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes + with shape (n, 7). + gt_labels_3d (torch.Tensor): Ground truth classes. + + Returns: + tuple: Targets of center, size and direction. + """ + # generate center target + center_target = gt_bboxes_3d.gravity_center + + # generate bbox size target + size_class_target = gt_labels_3d + size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor( + self.mean_sizes)[size_class_target] + + # generate dir target + box_num = gt_labels_3d.shape[0] + if self.with_rot: + (dir_class_target, + dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) + else: + dir_class_target = gt_labels_3d.new_zeros(box_num) + dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) + + return (center_target, size_class_target, size_res_target, + dir_class_target, dir_res_target) + + def decode(self, bbox_out, suffix=''): + """Decode predicted parts to bbox3d. + + Args: + bbox_out (dict): Predictions from model, should contain keys below. + + - center: predicted bottom center of bboxes. + - dir_class: predicted bbox direction class. + - dir_res: predicted bbox direction residual. + - size_class: predicted bbox size class. + - size_res: predicted bbox size residual. + suffix (str): Decode predictions with specific suffix. + + Returns: + torch.Tensor: Decoded bbox3d with shape (batch, n, 7). 
+ """ + center = bbox_out['center' + suffix] + batch_size, num_proposal = center.shape[:2] + + # decode heading angle + if self.with_rot: + dir_class = torch.argmax(bbox_out['dir_class' + suffix], -1) + dir_res = torch.gather(bbox_out['dir_res' + suffix], 2, + dir_class.unsqueeze(-1)) + dir_res.squeeze_(2) + dir_angle = self.class2angle(dir_class, dir_res).reshape( + batch_size, num_proposal, 1) + else: + dir_angle = center.new_zeros(batch_size, num_proposal, 1) + + # decode bbox size + size_class = torch.argmax( + bbox_out['size_class' + suffix], -1, keepdim=True) + size_res = torch.gather(bbox_out['size_res' + suffix], 2, + size_class.unsqueeze(-1).repeat(1, 1, 1, 3)) + mean_sizes = center.new_tensor(self.mean_sizes) + size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1)) + bbox_size = size_base.reshape(batch_size, num_proposal, + -1) + size_res.squeeze(2) + + bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) + return bbox3d + + def decode_corners(self, center, size_res, size_class): + """Decode center, size residuals and class to corners. Only useful for + axis-aligned bounding boxes, so angle isn't considered. + + Args: + center (torch.Tensor): Shape [B, N, 3] + size_res (torch.Tensor): Shape [B, N, 3] or [B, N, C, 3] + size_class (torch.Tensor): Shape: [B, N] or [B, N, 1] + or [B, N, C, 3] + + Returns: + torch.Tensor: Corners with shape [B, N, 6] + """ + if len(size_class.shape) == 2 or size_class.shape[-1] == 1: + batch_size, proposal_num = size_class.shape[:2] + one_hot_size_class = size_res.new_zeros( + (batch_size, proposal_num, self.num_sizes)) + if len(size_class.shape) == 2: + size_class = size_class.unsqueeze(-1) + one_hot_size_class.scatter_(2, size_class, 1) + one_hot_size_class_expand = one_hot_size_class.unsqueeze( + -1).repeat(1, 1, 1, 3).contiguous() + else: + one_hot_size_class_expand = size_class + + if len(size_res.shape) == 4: + size_res = torch.sum(size_res * one_hot_size_class_expand, 2) + + mean_sizes = size_res.new_tensor(self.mean_sizes) + mean_sizes = torch.sum(mean_sizes * one_hot_size_class_expand, 2) + size_full = (size_res + 1) * mean_sizes + size_full = torch.clamp(size_full, 0) + half_size_full = size_full / 2 + corner1 = center - half_size_full + corner2 = center + half_size_full + corners = torch.cat([corner1, corner2], dim=-1) + return corners + + def split_pred(self, cls_preds, reg_preds, base_xyz): + """Split predicted features to specific parts. + + Args: + cls_preds (torch.Tensor): Class predicted features to split. + reg_preds (torch.Tensor): Regression predicted features to split. + base_xyz (torch.Tensor): Coordinates of points. + + Returns: + dict[str, torch.Tensor]: Split results. 
+ """ + results = {} + start, end = 0, 0 + + cls_preds_trans = cls_preds.transpose(2, 1) + reg_preds_trans = reg_preds.transpose(2, 1) + + # decode center + end += 3 + # (batch_size, num_proposal, 3) + results['center'] = base_xyz + \ + reg_preds_trans[..., start:end].contiguous() + start = end + + # decode direction + end += self.num_dir_bins + results['dir_class'] = reg_preds_trans[..., start:end].contiguous() + start = end + + end += self.num_dir_bins + dir_res_norm = reg_preds_trans[..., start:end].contiguous() + start = end + + results['dir_res_norm'] = dir_res_norm + results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins) + + # decode size + end += self.num_sizes + results['size_class'] = reg_preds_trans[..., start:end].contiguous() + start = end + + end += self.num_sizes * 3 + size_res_norm = reg_preds_trans[..., start:end] + batch_size, num_proposal = reg_preds_trans.shape[:2] + size_res_norm = size_res_norm.view( + [batch_size, num_proposal, self.num_sizes, 3]) + start = end + + results['size_res_norm'] = size_res_norm.contiguous() + mean_sizes = reg_preds.new_tensor(self.mean_sizes) + results['size_res'] = ( + size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0)) + + # decode objectness score + start = 0 + end = 2 + results['obj_scores'] = cls_preds_trans[..., start:end].contiguous() + start = end + + # decode semantic score + results['sem_scores'] = cls_preds_trans[..., start:].contiguous() + + return results + + def angle2class(self, angle): + """Convert continuous angle to a discrete class and a residual. + + Convert continuous angle to a discrete class and a small + regression number from class center angle to current angle. + + Args: + angle (torch.Tensor): Angle is from 0-2pi (or -pi~pi), + class center at 0, 1*(2pi/N), 2*(2pi/N) ... (N-1)*(2pi/N). + + Returns: + tuple: Encoded discrete class and residual. + """ + angle = angle % (2 * np.pi) + angle_per_class = 2 * np.pi / float(self.num_dir_bins) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + angle_cls = shifted_angle // angle_per_class + angle_res = shifted_angle - ( + angle_cls * angle_per_class + angle_per_class / 2) + return angle_cls.long(), angle_res + + def class2angle(self, angle_cls, angle_res, limit_period=True): + """Inverse function to angle2class. + + Args: + angle_cls (torch.Tensor): Angle class to decode. + angle_res (torch.Tensor): Angle residual to decode. + limit_period (bool): Whether to limit angle to [-pi, pi]. + + Returns: + torch.Tensor: Angle decoded from angle_cls and angle_res. + """ + angle_per_class = 2 * np.pi / float(self.num_dir_bins) + angle_center = angle_cls.float() * angle_per_class + angle = angle_center + angle_res + if limit_period: + angle[angle > np.pi] -= 2 * np.pi + return angle diff --git a/mmdet3d/core/bbox/coders/pgd_bbox_coder.py b/mmdet3d/core/bbox/coders/pgd_bbox_coder.py index 094ed39..6ec8001 100644 --- a/mmdet3d/core/bbox/coders/pgd_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/pgd_bbox_coder.py @@ -1,128 +1,128 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import numpy as np -import torch -from torch.nn import functional as F - -from mmdet.core.bbox.builder import BBOX_CODERS -from .fcos3d_bbox_coder import FCOS3DBBoxCoder - - -@BBOX_CODERS.register_module() -class PGDBBoxCoder(FCOS3DBBoxCoder): - """Bounding box coder for PGD.""" - - def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels): - # TODO: refactor the encoder codes in the FCOS3D and PGD head - pass - - def decode_2d(self, - bbox, - scale, - stride, - max_regress_range, - training, - pred_keypoints=False, - pred_bbox2d=True): - """Decode regressed 2D attributes. - - Args: - bbox (torch.Tensor): Raw bounding box predictions in shape - [N, C, H, W]. - scale (tuple[`Scale`]): Learnable scale parameters. - stride (int): Stride for a specific feature level. - max_regress_range (int): Maximum regression range for a specific - feature level. - training (bool): Whether the decoding is in the training - procedure. - pred_keypoints (bool, optional): Whether to predict keypoints. - Defaults to False. - pred_bbox2d (bool, optional): Whether to predict 2D bounding - boxes. Defaults to False. - - Returns: - torch.Tensor: Decoded boxes. - """ - clone_bbox = bbox.clone() - if pred_keypoints: - scale_kpts = scale[3] - # 2 dimension of offsets x 8 corners of a 3D bbox - bbox[:, self.bbox_code_size:self.bbox_code_size + 16] = \ - torch.tanh(scale_kpts(clone_bbox[ - :, self.bbox_code_size:self.bbox_code_size + 16]).float()) - - if pred_bbox2d: - scale_bbox2d = scale[-1] - # The last four dimensions are offsets to four sides of a 2D bbox - bbox[:, -4:] = scale_bbox2d(clone_bbox[:, -4:]).float() - - if self.norm_on_bbox: - if pred_bbox2d: - bbox[:, -4:] = F.relu(bbox.clone()[:, -4:]) - if not training: - if pred_keypoints: - bbox[ - :, self.bbox_code_size:self.bbox_code_size + 16] *= \ - max_regress_range - if pred_bbox2d: - bbox[:, -4:] *= stride - else: - if pred_bbox2d: - bbox[:, -4:] = bbox.clone()[:, -4:].exp() - return bbox - - def decode_prob_depth(self, depth_cls_preds, depth_range, depth_unit, - division, num_depth_cls): - """Decode probabilistic depth map. - - Args: - depth_cls_preds (torch.Tensor): Depth probabilistic map in shape - [..., self.num_depth_cls] (raw output before softmax). - depth_range (tuple[float]): Range of depth estimation. - depth_unit (int): Unit of depth range division. - division (str): Depth division method. Options include 'uniform', - 'linear', 'log', 'loguniform'. - num_depth_cls (int): Number of depth classes. - - Returns: - torch.Tensor: Decoded probabilistic depth estimation. 
- """ - if division == 'uniform': - depth_multiplier = depth_unit * \ - depth_cls_preds.new_tensor( - list(range(num_depth_cls))).reshape([1, -1]) - prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * - depth_multiplier).sum(dim=-1) - return prob_depth_preds - elif division == 'linear': - split_pts = depth_cls_preds.new_tensor(list( - range(num_depth_cls))).reshape([1, -1]) - depth_multiplier = depth_range[0] + ( - depth_range[1] - depth_range[0]) / \ - (num_depth_cls * (num_depth_cls - 1)) * \ - (split_pts * (split_pts+1)) - prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * - depth_multiplier).sum(dim=-1) - return prob_depth_preds - elif division == 'log': - split_pts = depth_cls_preds.new_tensor(list( - range(num_depth_cls))).reshape([1, -1]) - start = max(depth_range[0], 1) - end = depth_range[1] - depth_multiplier = (np.log(start) + - split_pts * np.log(end / start) / - (num_depth_cls - 1)).exp() - prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * - depth_multiplier).sum(dim=-1) - return prob_depth_preds - elif division == 'loguniform': - split_pts = depth_cls_preds.new_tensor(list( - range(num_depth_cls))).reshape([1, -1]) - start = max(depth_range[0], 1) - end = depth_range[1] - log_multiplier = np.log(start) + \ - split_pts * np.log(end / start) / (num_depth_cls - 1) - prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * - log_multiplier).sum(dim=-1).exp() - return prob_depth_preds - else: - raise NotImplementedError +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from torch.nn import functional as F + +from mmdet.core.bbox.builder import BBOX_CODERS +from .fcos3d_bbox_coder import FCOS3DBBoxCoder + + +@BBOX_CODERS.register_module() +class PGDBBoxCoder(FCOS3DBBoxCoder): + """Bounding box coder for PGD.""" + + def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels): + # TODO: refactor the encoder codes in the FCOS3D and PGD head + pass + + def decode_2d(self, + bbox, + scale, + stride, + max_regress_range, + training, + pred_keypoints=False, + pred_bbox2d=True): + """Decode regressed 2D attributes. + + Args: + bbox (torch.Tensor): Raw bounding box predictions in shape + [N, C, H, W]. + scale (tuple[`Scale`]): Learnable scale parameters. + stride (int): Stride for a specific feature level. + max_regress_range (int): Maximum regression range for a specific + feature level. + training (bool): Whether the decoding is in the training + procedure. + pred_keypoints (bool, optional): Whether to predict keypoints. + Defaults to False. + pred_bbox2d (bool, optional): Whether to predict 2D bounding + boxes. Defaults to False. + + Returns: + torch.Tensor: Decoded boxes. 
+ """ + clone_bbox = bbox.clone() + if pred_keypoints: + scale_kpts = scale[3] + # 2 dimension of offsets x 8 corners of a 3D bbox + bbox[:, self.bbox_code_size:self.bbox_code_size + 16] = \ + torch.tanh(scale_kpts(clone_bbox[ + :, self.bbox_code_size:self.bbox_code_size + 16]).float()) + + if pred_bbox2d: + scale_bbox2d = scale[-1] + # The last four dimensions are offsets to four sides of a 2D bbox + bbox[:, -4:] = scale_bbox2d(clone_bbox[:, -4:]).float() + + if self.norm_on_bbox: + if pred_bbox2d: + bbox[:, -4:] = F.relu(bbox.clone()[:, -4:]) + if not training: + if pred_keypoints: + bbox[ + :, self.bbox_code_size:self.bbox_code_size + 16] *= \ + max_regress_range + if pred_bbox2d: + bbox[:, -4:] *= stride + else: + if pred_bbox2d: + bbox[:, -4:] = bbox.clone()[:, -4:].exp() + return bbox + + def decode_prob_depth(self, depth_cls_preds, depth_range, depth_unit, + division, num_depth_cls): + """Decode probabilistic depth map. + + Args: + depth_cls_preds (torch.Tensor): Depth probabilistic map in shape + [..., self.num_depth_cls] (raw output before softmax). + depth_range (tuple[float]): Range of depth estimation. + depth_unit (int): Unit of depth range division. + division (str): Depth division method. Options include 'uniform', + 'linear', 'log', 'loguniform'. + num_depth_cls (int): Number of depth classes. + + Returns: + torch.Tensor: Decoded probabilistic depth estimation. + """ + if division == 'uniform': + depth_multiplier = depth_unit * \ + depth_cls_preds.new_tensor( + list(range(num_depth_cls))).reshape([1, -1]) + prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * + depth_multiplier).sum(dim=-1) + return prob_depth_preds + elif division == 'linear': + split_pts = depth_cls_preds.new_tensor(list( + range(num_depth_cls))).reshape([1, -1]) + depth_multiplier = depth_range[0] + ( + depth_range[1] - depth_range[0]) / \ + (num_depth_cls * (num_depth_cls - 1)) * \ + (split_pts * (split_pts+1)) + prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * + depth_multiplier).sum(dim=-1) + return prob_depth_preds + elif division == 'log': + split_pts = depth_cls_preds.new_tensor(list( + range(num_depth_cls))).reshape([1, -1]) + start = max(depth_range[0], 1) + end = depth_range[1] + depth_multiplier = (np.log(start) + + split_pts * np.log(end / start) / + (num_depth_cls - 1)).exp() + prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * + depth_multiplier).sum(dim=-1) + return prob_depth_preds + elif division == 'loguniform': + split_pts = depth_cls_preds.new_tensor(list( + range(num_depth_cls))).reshape([1, -1]) + start = max(depth_range[0], 1) + end = depth_range[1] + log_multiplier = np.log(start) + \ + split_pts * np.log(end / start) / (num_depth_cls - 1) + prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * + log_multiplier).sum(dim=-1).exp() + return prob_depth_preds + else: + raise NotImplementedError diff --git a/mmdet3d/core/bbox/coders/point_xyzwhlr_bbox_coder.py b/mmdet3d/core/bbox/coders/point_xyzwhlr_bbox_coder.py index d246777..0935aca 100644 --- a/mmdet3d/core/bbox/coders/point_xyzwhlr_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/point_xyzwhlr_bbox_coder.py @@ -1,117 +1,117 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS - - -@BBOX_CODERS.register_module() -class PointXYZWHLRBBoxCoder(BaseBBoxCoder): - """Point based bbox coder for 3D boxes. 
- - Args: - code_size (int): The dimension of boxes to be encoded. - use_mean_size (bool, optional): Whether using anchors based on class. - Defaults to True. - mean_size (list[list[float]], optional): Mean size of bboxes in - each class. Defaults to None. - """ - - def __init__(self, code_size=7, use_mean_size=True, mean_size=None): - super(PointXYZWHLRBBoxCoder, self).__init__() - self.code_size = code_size - self.use_mean_size = use_mean_size - if self.use_mean_size: - self.mean_size = torch.from_numpy(np.array(mean_size)).float() - assert self.mean_size.min() > 0, \ - f'The min of mean_size should > 0, however currently it is '\ - f'{self.mean_size.min()}, please check it in your config.' - - def encode(self, gt_bboxes_3d, points, gt_labels_3d=None): - """Encode ground truth to prediction targets. - - Args: - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth bboxes - with shape (N, 7 + C). - points (torch.Tensor): Point cloud with shape (N, 3). - gt_labels_3d (torch.Tensor, optional): Ground truth classes. - Defaults to None. - - Returns: - torch.Tensor: Encoded boxes with shape (N, 8 + C). - """ - gt_bboxes_3d[:, 3:6] = torch.clamp_min(gt_bboxes_3d[:, 3:6], min=1e-5) - - xg, yg, zg, dxg, dyg, dzg, rg, *cgs = torch.split( - gt_bboxes_3d, 1, dim=-1) - xa, ya, za = torch.split(points, 1, dim=-1) - - if self.use_mean_size: - assert gt_labels_3d.max() <= self.mean_size.shape[0] - 1, \ - f'the max gt label {gt_labels_3d.max()} is bigger than' \ - f'anchor types {self.mean_size.shape[0] - 1}.' - self.mean_size = self.mean_size.to(gt_labels_3d.device) - point_anchor_size = self.mean_size[gt_labels_3d] - dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1) - diagonal = torch.sqrt(dxa**2 + dya**2) - xt = (xg - xa) / diagonal - yt = (yg - ya) / diagonal - zt = (zg - za) / dza - dxt = torch.log(dxg / dxa) - dyt = torch.log(dyg / dya) - dzt = torch.log(dzg / dza) - else: - xt = (xg - xa) - yt = (yg - ya) - zt = (zg - za) - dxt = torch.log(dxg) - dyt = torch.log(dyg) - dzt = torch.log(dzg) - - return torch.cat( - [xt, yt, zt, dxt, dyt, dzt, - torch.cos(rg), - torch.sin(rg), *cgs], - dim=-1) - - def decode(self, box_encodings, points, pred_labels_3d=None): - """Decode predicted parts and points to bbox3d. - - Args: - box_encodings (torch.Tensor): Encoded boxes with shape (N, 8 + C). - points (torch.Tensor): Point cloud with shape (N, 3). - pred_labels_3d (torch.Tensor): Bbox predicted labels (N, M). - - Returns: - torch.Tensor: Decoded boxes with shape (N, 7 + C) - """ - xt, yt, zt, dxt, dyt, dzt, cost, sint, *cts = torch.split( - box_encodings, 1, dim=-1) - xa, ya, za = torch.split(points, 1, dim=-1) - - if self.use_mean_size: - assert pred_labels_3d.max() <= self.mean_size.shape[0] - 1, \ - f'The max pred label {pred_labels_3d.max()} is bigger than' \ - f'anchor types {self.mean_size.shape[0] - 1}.' - self.mean_size = self.mean_size.to(pred_labels_3d.device) - point_anchor_size = self.mean_size[pred_labels_3d] - dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1) - diagonal = torch.sqrt(dxa**2 + dya**2) - xg = xt * diagonal + xa - yg = yt * diagonal + ya - zg = zt * dza + za - - dxg = torch.exp(dxt) * dxa - dyg = torch.exp(dyt) * dya - dzg = torch.exp(dzt) * dza - else: - xg = xt + xa - yg = yt + ya - zg = zt + za - dxg, dyg, dzg = torch.split( - torch.exp(box_encodings[..., 3:6]), 1, dim=-1) - - rg = torch.atan2(sint, cost) - - return torch.cat([xg, yg, zg, dxg, dyg, dzg, rg, *cts], dim=-1) +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class PointXYZWHLRBBoxCoder(BaseBBoxCoder): + """Point based bbox coder for 3D boxes. + + Args: + code_size (int): The dimension of boxes to be encoded. + use_mean_size (bool, optional): Whether using anchors based on class. + Defaults to True. + mean_size (list[list[float]], optional): Mean size of bboxes in + each class. Defaults to None. + """ + + def __init__(self, code_size=7, use_mean_size=True, mean_size=None): + super(PointXYZWHLRBBoxCoder, self).__init__() + self.code_size = code_size + self.use_mean_size = use_mean_size + if self.use_mean_size: + self.mean_size = torch.from_numpy(np.array(mean_size)).float() + assert self.mean_size.min() > 0, \ + f'The min of mean_size should > 0, however currently it is '\ + f'{self.mean_size.min()}, please check it in your config.' + + def encode(self, gt_bboxes_3d, points, gt_labels_3d=None): + """Encode ground truth to prediction targets. + + Args: + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth bboxes + with shape (N, 7 + C). + points (torch.Tensor): Point cloud with shape (N, 3). + gt_labels_3d (torch.Tensor, optional): Ground truth classes. + Defaults to None. + + Returns: + torch.Tensor: Encoded boxes with shape (N, 8 + C). + """ + gt_bboxes_3d[:, 3:6] = torch.clamp_min(gt_bboxes_3d[:, 3:6], min=1e-5) + + xg, yg, zg, dxg, dyg, dzg, rg, *cgs = torch.split( + gt_bboxes_3d, 1, dim=-1) + xa, ya, za = torch.split(points, 1, dim=-1) + + if self.use_mean_size: + assert gt_labels_3d.max() <= self.mean_size.shape[0] - 1, \ + f'the max gt label {gt_labels_3d.max()} is bigger than' \ + f'anchor types {self.mean_size.shape[0] - 1}.' + self.mean_size = self.mean_size.to(gt_labels_3d.device) + point_anchor_size = self.mean_size[gt_labels_3d] + dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1) + diagonal = torch.sqrt(dxa**2 + dya**2) + xt = (xg - xa) / diagonal + yt = (yg - ya) / diagonal + zt = (zg - za) / dza + dxt = torch.log(dxg / dxa) + dyt = torch.log(dyg / dya) + dzt = torch.log(dzg / dza) + else: + xt = (xg - xa) + yt = (yg - ya) + zt = (zg - za) + dxt = torch.log(dxg) + dyt = torch.log(dyg) + dzt = torch.log(dzg) + + return torch.cat( + [xt, yt, zt, dxt, dyt, dzt, + torch.cos(rg), + torch.sin(rg), *cgs], + dim=-1) + + def decode(self, box_encodings, points, pred_labels_3d=None): + """Decode predicted parts and points to bbox3d. + + Args: + box_encodings (torch.Tensor): Encoded boxes with shape (N, 8 + C). + points (torch.Tensor): Point cloud with shape (N, 3). + pred_labels_3d (torch.Tensor): Bbox predicted labels (N, M). + + Returns: + torch.Tensor: Decoded boxes with shape (N, 7 + C) + """ + xt, yt, zt, dxt, dyt, dzt, cost, sint, *cts = torch.split( + box_encodings, 1, dim=-1) + xa, ya, za = torch.split(points, 1, dim=-1) + + if self.use_mean_size: + assert pred_labels_3d.max() <= self.mean_size.shape[0] - 1, \ + f'The max pred label {pred_labels_3d.max()} is bigger than' \ + f'anchor types {self.mean_size.shape[0] - 1}.' 
+ self.mean_size = self.mean_size.to(pred_labels_3d.device) + point_anchor_size = self.mean_size[pred_labels_3d] + dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1) + diagonal = torch.sqrt(dxa**2 + dya**2) + xg = xt * diagonal + xa + yg = yt * diagonal + ya + zg = zt * dza + za + + dxg = torch.exp(dxt) * dxa + dyg = torch.exp(dyt) * dya + dzg = torch.exp(dzt) * dza + else: + xg = xt + xa + yg = yt + ya + zg = zt + za + dxg, dyg, dzg = torch.split( + torch.exp(box_encodings[..., 3:6]), 1, dim=-1) + + rg = torch.atan2(sint, cost) + + return torch.cat([xg, yg, zg, dxg, dyg, dzg, rg, *cts], dim=-1) diff --git a/mmdet3d/core/bbox/coders/smoke_bbox_coder.py b/mmdet3d/core/bbox/coders/smoke_bbox_coder.py index 134af3a..814f332 100644 --- a/mmdet3d/core/bbox/coders/smoke_bbox_coder.py +++ b/mmdet3d/core/bbox/coders/smoke_bbox_coder.py @@ -1,208 +1,208 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from mmdet.core.bbox import BaseBBoxCoder -from mmdet.core.bbox.builder import BBOX_CODERS - - -@BBOX_CODERS.register_module() -class SMOKECoder(BaseBBoxCoder): - """Bbox Coder for SMOKE. - - Args: - base_depth (tuple[float]): Depth references for decode box depth. - base_dims (tuple[tuple[float]]): Dimension references [l, h, w] - for decode box dimension for each category. - code_size (int): The dimension of boxes to be encoded. - """ - - def __init__(self, base_depth, base_dims, code_size): - super(SMOKECoder, self).__init__() - self.base_depth = base_depth - self.base_dims = base_dims - self.bbox_code_size = code_size - - def encode(self, locations, dimensions, orientations, input_metas): - """Encode CameraInstance3DBoxes by locations, dimensions, orientations. - - Args: - locations (Tensor): Center location for 3D boxes. - (N, 3) - dimensions (Tensor): Dimensions for 3D boxes. - shape (N, 3) - orientations (Tensor): Orientations for 3D boxes. - shape (N, 1) - input_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Return: - :obj:`CameraInstance3DBoxes`: 3D bboxes of batch images, - shape (N, bbox_code_size). - """ - - bboxes = torch.cat((locations, dimensions, orientations), dim=1) - assert bboxes.shape[1] == self.bbox_code_size, 'bboxes shape dose not'\ - 'match the bbox_code_size.' - batch_bboxes = input_metas[0]['box_type_3d']( - bboxes, box_dim=self.bbox_code_size) - - return batch_bboxes - - def decode(self, - reg, - points, - labels, - cam2imgs, - trans_mats, - locations=None): - """Decode regression into locations, dimensions, orientations. - - Args: - reg (Tensor): Batch regression for each predict center2d point. - shape: (batch * K (max_objs), C) - points(Tensor): Batch projected bbox centers on image plane. - shape: (batch * K (max_objs) , 2) - labels (Tensor): Batch predict class label for each predict - center2d point. - shape: (batch, K (max_objs)) - cam2imgs (Tensor): Batch images' camera intrinsic matrix. - shape: kitti (batch, 4, 4) nuscenes (batch, 3, 3) - trans_mats (Tensor): transformation matrix from original image - to feature map. - shape: (batch, 3, 3) - locations (None | Tensor): if locations is None, this function - is used to decode while inference, otherwise, it's used while - training using the ground truth 3d bbox locations. - shape: (batch * K (max_objs), 3) - - Return: - tuple(Tensor): The tuple has components below: - - locations (Tensor): Centers of 3D boxes. - shape: (batch * K (max_objs), 3) - - dimensions (Tensor): Dimensions of 3D boxes. 
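As a sanity check on the residual transform above, the snippet below hand-rolls the use_mean_size=False branch of PointXYZWHLRBBoxCoder for a single box: offsets to the reference point, log sizes and a (cos, sin) heading are encoded and then inverted. All values are made up.

# Encode/decode round trip of the use_mean_size=False branch.
import torch

point = torch.tensor([1.0, 2.0, 0.5])                       # (x, y, z)
gt_box = torch.tensor([3.0, 1.0, 0.8, 4.0, 1.8, 1.6, 0.3])  # x y z dx dy dz r

# encode: offsets to the point, log sizes, (cos, sin) of the yaw
xt, yt, zt = gt_box[:3] - point
dxt, dyt, dzt = torch.log(gt_box[3:6])
encoding = torch.stack(
    [xt, yt, zt, dxt, dyt, dzt, torch.cos(gt_box[6]), torch.sin(gt_box[6])])

# decode: invert each step
center = encoding[:3] + point
dims = torch.exp(encoding[3:6])
yaw = torch.atan2(encoding[7], encoding[6])
decoded = torch.cat([center, dims, yaw.unsqueeze(0)])
print(torch.allclose(decoded, gt_box, atol=1e-5))  # True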
- shape: (batch * K (max_objs), 3) - - orientations (Tensor): Orientations of 3D - boxes. - shape: (batch * K (max_objs), 1) - """ - depth_offsets = reg[:, 0] - centers2d_offsets = reg[:, 1:3] - dimensions_offsets = reg[:, 3:6] - orientations = reg[:, 6:8] - depths = self._decode_depth(depth_offsets) - # get the 3D Bounding box's center location. - pred_locations = self._decode_location(points, centers2d_offsets, - depths, cam2imgs, trans_mats) - pred_dimensions = self._decode_dimension(labels, dimensions_offsets) - if locations is None: - pred_orientations = self._decode_orientation( - orientations, pred_locations) - else: - pred_orientations = self._decode_orientation( - orientations, locations) - - return pred_locations, pred_dimensions, pred_orientations - - def _decode_depth(self, depth_offsets): - """Transform depth offset to depth.""" - base_depth = depth_offsets.new_tensor(self.base_depth) - depths = depth_offsets * base_depth[1] + base_depth[0] - - return depths - - def _decode_location(self, points, centers2d_offsets, depths, cam2imgs, - trans_mats): - """Retrieve objects location in camera coordinate based on projected - points. - - Args: - points (Tensor): Projected points on feature map in (x, y) - shape: (batch * K, 2) - centers2d_offset (Tensor): Project points offset in - (delta_x, delta_y). shape: (batch * K, 2) - depths (Tensor): Object depth z. - shape: (batch * K) - cam2imgs (Tensor): Batch camera intrinsics matrix. - shape: kitti (batch, 4, 4) nuscenes (batch, 3, 3) - trans_mats (Tensor): transformation matrix from original image - to feature map. - shape: (batch, 3, 3) - """ - # number of points - N = centers2d_offsets.shape[0] - # batch_size - N_batch = cam2imgs.shape[0] - batch_id = torch.arange(N_batch).unsqueeze(1) - obj_id = batch_id.repeat(1, N // N_batch).flatten() - trans_mats_inv = trans_mats.inverse()[obj_id] - cam2imgs_inv = cam2imgs.inverse()[obj_id] - centers2d = points + centers2d_offsets - centers2d_extend = torch.cat((centers2d, centers2d.new_ones(N, 1)), - dim=1) - # expand project points as [N, 3, 1] - centers2d_extend = centers2d_extend.unsqueeze(-1) - # transform project points back on original image - centers2d_img = torch.matmul(trans_mats_inv, centers2d_extend) - centers2d_img = centers2d_img * depths.view(N, -1, 1) - if cam2imgs.shape[1] == 4: - centers2d_img = torch.cat( - (centers2d_img, centers2d.new_ones(N, 1, 1)), dim=1) - locations = torch.matmul(cam2imgs_inv, centers2d_img).squeeze(2) - - return locations[:, :3] - - def _decode_dimension(self, labels, dims_offset): - """Transform dimension offsets to dimension according to its category. - - Args: - labels (Tensor): Each points' category id. - shape: (N, K) - dims_offset (Tensor): Dimension offsets. - shape: (N, 3) - """ - labels = labels.flatten().long() - base_dims = dims_offset.new_tensor(self.base_dims) - dims_select = base_dims[labels, :] - dimensions = dims_offset.exp() * dims_select - - return dimensions - - def _decode_orientation(self, ori_vector, locations): - """Retrieve object orientation. - - Args: - ori_vector (Tensor): Local orientation in [sin, cos] format. - shape: (N, 2) - locations (Tensor): Object location. - shape: (N, 3) - - Return: - Tensor: yaw(Orientation). Notice that the yaw's - range is [-np.pi, np.pi]. 
- shape:(N, 1) - """ - assert len(ori_vector) == len(locations) - locations = locations.view(-1, 3) - rays = torch.atan(locations[:, 0] / (locations[:, 2] + 1e-7)) - alphas = torch.atan(ori_vector[:, 0] / (ori_vector[:, 1] + 1e-7)) - - # get cosine value positive and negative index. - cos_pos_inds = (ori_vector[:, 1] >= 0).nonzero(as_tuple=False) - cos_neg_inds = (ori_vector[:, 1] < 0).nonzero(as_tuple=False) - - alphas[cos_pos_inds] -= np.pi / 2 - alphas[cos_neg_inds] += np.pi / 2 - # retrieve object rotation y angle. - yaws = alphas + rays - - larger_inds = (yaws > np.pi).nonzero(as_tuple=False) - small_inds = (yaws < -np.pi).nonzero(as_tuple=False) - - if len(larger_inds) != 0: - yaws[larger_inds] -= 2 * np.pi - if len(small_inds) != 0: - yaws[small_inds] += 2 * np.pi - - yaws = yaws.unsqueeze(-1) - return yaws +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet.core.bbox import BaseBBoxCoder +from mmdet.core.bbox.builder import BBOX_CODERS + + +@BBOX_CODERS.register_module() +class SMOKECoder(BaseBBoxCoder): + """Bbox Coder for SMOKE. + + Args: + base_depth (tuple[float]): Depth references for decode box depth. + base_dims (tuple[tuple[float]]): Dimension references [l, h, w] + for decode box dimension for each category. + code_size (int): The dimension of boxes to be encoded. + """ + + def __init__(self, base_depth, base_dims, code_size): + super(SMOKECoder, self).__init__() + self.base_depth = base_depth + self.base_dims = base_dims + self.bbox_code_size = code_size + + def encode(self, locations, dimensions, orientations, input_metas): + """Encode CameraInstance3DBoxes by locations, dimensions, orientations. + + Args: + locations (Tensor): Center location for 3D boxes. + (N, 3) + dimensions (Tensor): Dimensions for 3D boxes. + shape (N, 3) + orientations (Tensor): Orientations for 3D boxes. + shape (N, 1) + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Return: + :obj:`CameraInstance3DBoxes`: 3D bboxes of batch images, + shape (N, bbox_code_size). + """ + + bboxes = torch.cat((locations, dimensions, orientations), dim=1) + assert bboxes.shape[1] == self.bbox_code_size, 'bboxes shape dose not'\ + 'match the bbox_code_size.' + batch_bboxes = input_metas[0]['box_type_3d']( + bboxes, box_dim=self.bbox_code_size) + + return batch_bboxes + + def decode(self, + reg, + points, + labels, + cam2imgs, + trans_mats, + locations=None): + """Decode regression into locations, dimensions, orientations. + + Args: + reg (Tensor): Batch regression for each predict center2d point. + shape: (batch * K (max_objs), C) + points(Tensor): Batch projected bbox centers on image plane. + shape: (batch * K (max_objs) , 2) + labels (Tensor): Batch predict class label for each predict + center2d point. + shape: (batch, K (max_objs)) + cam2imgs (Tensor): Batch images' camera intrinsic matrix. + shape: kitti (batch, 4, 4) nuscenes (batch, 3, 3) + trans_mats (Tensor): transformation matrix from original image + to feature map. + shape: (batch, 3, 3) + locations (None | Tensor): if locations is None, this function + is used to decode while inference, otherwise, it's used while + training using the ground truth 3d bbox locations. + shape: (batch * K (max_objs), 3) + + Return: + tuple(Tensor): The tuple has components below: + - locations (Tensor): Centers of 3D boxes. + shape: (batch * K (max_objs), 3) + - dimensions (Tensor): Dimensions of 3D boxes. 
+ shape: (batch * K (max_objs), 3) + - orientations (Tensor): Orientations of 3D + boxes. + shape: (batch * K (max_objs), 1) + """ + depth_offsets = reg[:, 0] + centers2d_offsets = reg[:, 1:3] + dimensions_offsets = reg[:, 3:6] + orientations = reg[:, 6:8] + depths = self._decode_depth(depth_offsets) + # get the 3D Bounding box's center location. + pred_locations = self._decode_location(points, centers2d_offsets, + depths, cam2imgs, trans_mats) + pred_dimensions = self._decode_dimension(labels, dimensions_offsets) + if locations is None: + pred_orientations = self._decode_orientation( + orientations, pred_locations) + else: + pred_orientations = self._decode_orientation( + orientations, locations) + + return pred_locations, pred_dimensions, pred_orientations + + def _decode_depth(self, depth_offsets): + """Transform depth offset to depth.""" + base_depth = depth_offsets.new_tensor(self.base_depth) + depths = depth_offsets * base_depth[1] + base_depth[0] + + return depths + + def _decode_location(self, points, centers2d_offsets, depths, cam2imgs, + trans_mats): + """Retrieve objects location in camera coordinate based on projected + points. + + Args: + points (Tensor): Projected points on feature map in (x, y) + shape: (batch * K, 2) + centers2d_offset (Tensor): Project points offset in + (delta_x, delta_y). shape: (batch * K, 2) + depths (Tensor): Object depth z. + shape: (batch * K) + cam2imgs (Tensor): Batch camera intrinsics matrix. + shape: kitti (batch, 4, 4) nuscenes (batch, 3, 3) + trans_mats (Tensor): transformation matrix from original image + to feature map. + shape: (batch, 3, 3) + """ + # number of points + N = centers2d_offsets.shape[0] + # batch_size + N_batch = cam2imgs.shape[0] + batch_id = torch.arange(N_batch).unsqueeze(1) + obj_id = batch_id.repeat(1, N // N_batch).flatten() + trans_mats_inv = trans_mats.inverse()[obj_id] + cam2imgs_inv = cam2imgs.inverse()[obj_id] + centers2d = points + centers2d_offsets + centers2d_extend = torch.cat((centers2d, centers2d.new_ones(N, 1)), + dim=1) + # expand project points as [N, 3, 1] + centers2d_extend = centers2d_extend.unsqueeze(-1) + # transform project points back on original image + centers2d_img = torch.matmul(trans_mats_inv, centers2d_extend) + centers2d_img = centers2d_img * depths.view(N, -1, 1) + if cam2imgs.shape[1] == 4: + centers2d_img = torch.cat( + (centers2d_img, centers2d.new_ones(N, 1, 1)), dim=1) + locations = torch.matmul(cam2imgs_inv, centers2d_img).squeeze(2) + + return locations[:, :3] + + def _decode_dimension(self, labels, dims_offset): + """Transform dimension offsets to dimension according to its category. + + Args: + labels (Tensor): Each points' category id. + shape: (N, K) + dims_offset (Tensor): Dimension offsets. + shape: (N, 3) + """ + labels = labels.flatten().long() + base_dims = dims_offset.new_tensor(self.base_dims) + dims_select = base_dims[labels, :] + dimensions = dims_offset.exp() * dims_select + + return dimensions + + def _decode_orientation(self, ori_vector, locations): + """Retrieve object orientation. + + Args: + ori_vector (Tensor): Local orientation in [sin, cos] format. + shape: (N, 2) + locations (Tensor): Object location. + shape: (N, 3) + + Return: + Tensor: yaw(Orientation). Notice that the yaw's + range is [-np.pi, np.pi]. 
+ shape:(N, 1) + """ + assert len(ori_vector) == len(locations) + locations = locations.view(-1, 3) + rays = torch.atan(locations[:, 0] / (locations[:, 2] + 1e-7)) + alphas = torch.atan(ori_vector[:, 0] / (ori_vector[:, 1] + 1e-7)) + + # get cosine value positive and negative index. + cos_pos_inds = (ori_vector[:, 1] >= 0).nonzero(as_tuple=False) + cos_neg_inds = (ori_vector[:, 1] < 0).nonzero(as_tuple=False) + + alphas[cos_pos_inds] -= np.pi / 2 + alphas[cos_neg_inds] += np.pi / 2 + # retrieve object rotation y angle. + yaws = alphas + rays + + larger_inds = (yaws > np.pi).nonzero(as_tuple=False) + small_inds = (yaws < -np.pi).nonzero(as_tuple=False) + + if len(larger_inds) != 0: + yaws[larger_inds] -= 2 * np.pi + if len(small_inds) != 0: + yaws[small_inds] += 2 * np.pi + + yaws = yaws.unsqueeze(-1) + return yaws diff --git a/mmdet3d/core/bbox/iou_calculators/__init__.py b/mmdet3d/core/bbox/iou_calculators/__init__.py index d2faf69..6518f11 100644 --- a/mmdet3d/core/bbox/iou_calculators/__init__.py +++ b/mmdet3d/core/bbox/iou_calculators/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, - BboxOverlapsNearest3D, - axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, - bbox_overlaps_nearest_3d) - -__all__ = [ - 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', - 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', - 'axis_aligned_bbox_overlaps_3d' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, + BboxOverlapsNearest3D, + axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, + bbox_overlaps_nearest_3d) + +__all__ = [ + 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', + 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', + 'axis_aligned_bbox_overlaps_3d' +] diff --git a/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py b/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py index 2b1d8ea..a89a92c 100644 --- a/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py +++ b/mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py @@ -1,329 +1,329 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet.core.bbox import bbox_overlaps -from mmdet.core.bbox.iou_calculators.builder import IOU_CALCULATORS -from ..structures import get_box_type - - -@IOU_CALCULATORS.register_module() -class BboxOverlapsNearest3D(object): - """Nearest 3D IoU Calculator. - - Note: - This IoU calculator first finds the nearest 2D boxes in bird eye view - (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. - - Args: - coordinate (str): 'camera', 'lidar', or 'depth' coordinate system. - """ - - def __init__(self, coordinate='lidar'): - assert coordinate in ['camera', 'lidar', 'depth'] - self.coordinate = coordinate - - def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): - """Calculate nearest 3D IoU. - - Note: - If ``is_aligned`` is ``False``, then it calculates the ious between - each bbox of bboxes1 and bboxes2, otherwise it calculates the ious - between each aligned pair of bboxes1 and bboxes2. - - Args: - bboxes1 (torch.Tensor): shape (N, 7+N) - [x, y, z, x_size, y_size, z_size, ry, v]. - bboxes2 (torch.Tensor): shape (M, 7+N) - [x, y, z, x_size, y_size, z_size, ry, v]. - mode (str): "iou" (intersection over union) or iof - (intersection over foreground). - is_aligned (bool): Whether the calculation is aligned. 
- - Return: - torch.Tensor: If ``is_aligned`` is ``True``, return ious between - bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is - ``False``, return shape is M. - """ - return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned, - self.coordinate) - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(coordinate={self.coordinate}' - return repr_str - - -@IOU_CALCULATORS.register_module() -class BboxOverlaps3D(object): - """3D IoU Calculator. - - Args: - coordinate (str): The coordinate system, valid options are - 'camera', 'lidar', and 'depth'. - """ - - def __init__(self, coordinate): - assert coordinate in ['camera', 'lidar', 'depth'] - self.coordinate = coordinate - - def __call__(self, bboxes1, bboxes2, mode='iou'): - """Calculate 3D IoU using cuda implementation. - - Note: - This function calculate the IoU of 3D boxes based on their volumes. - IoU calculator ``:class:BboxOverlaps3D`` uses this function to - calculate the actual 3D IoUs of boxes. - - Args: - bboxes1 (torch.Tensor): with shape (N, 7+C), - (x, y, z, x_size, y_size, z_size, ry, v*). - bboxes2 (torch.Tensor): with shape (M, 7+C), - (x, y, z, x_size, y_size, z_size, ry, v*). - mode (str): "iou" (intersection over union) or - iof (intersection over foreground). - - Return: - torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 - with shape (M, N) (aligned mode is not supported currently). - """ - return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate) - - def __repr__(self): - """str: return a string that describes the module""" - repr_str = self.__class__.__name__ - repr_str += f'(coordinate={self.coordinate}' - return repr_str - - -def bbox_overlaps_nearest_3d(bboxes1, - bboxes2, - mode='iou', - is_aligned=False, - coordinate='lidar'): - """Calculate nearest 3D IoU. - - Note: - This function first finds the nearest 2D boxes in bird eye view - (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. - This IoU calculator :class:`BboxOverlapsNearest3D` uses this - function to calculate IoUs of boxes. - - If ``is_aligned`` is ``False``, then it calculates the ious between - each bbox of bboxes1 and bboxes2, otherwise the ious between each - aligned pair of bboxes1 and bboxes2. - - Args: - bboxes1 (torch.Tensor): with shape (N, 7+C), - (x, y, z, x_size, y_size, z_size, ry, v*). - bboxes2 (torch.Tensor): with shape (M, 7+C), - (x, y, z, x_size, y_size, z_size, ry, v*). - mode (str): "iou" (intersection over union) or iof - (intersection over foreground). - is_aligned (bool): Whether the calculation is aligned - - Return: - torch.Tensor: If ``is_aligned`` is ``True``, return ious between - bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is - ``False``, return shape is M. - """ - assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 - - box_type, _ = get_box_type(coordinate) - - bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) - bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) - - # Change the bboxes to bev - # box conversion and iou calculation in torch version on CUDA - # is 10x faster than that in numpy version - bboxes1_bev = bboxes1.nearest_bev - bboxes2_bev = bboxes2.nearest_bev - - ret = bbox_overlaps( - bboxes1_bev, bboxes2_bev, mode=mode, is_aligned=is_aligned) - return ret - - -def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'): - """Calculate 3D IoU using cuda implementation. - - Note: - This function calculates the IoU of 3D boxes based on their volumes. 
- IoU calculator :class:`BboxOverlaps3D` uses this function to - calculate the actual IoUs of boxes. - - Args: - bboxes1 (torch.Tensor): with shape (N, 7+C), - (x, y, z, x_size, y_size, z_size, ry, v*). - bboxes2 (torch.Tensor): with shape (M, 7+C), - (x, y, z, x_size, y_size, z_size, ry, v*). - mode (str): "iou" (intersection over union) or - iof (intersection over foreground). - coordinate (str): 'camera' or 'lidar' coordinate system. - - Return: - torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 - with shape (M, N) (aligned mode is not supported currently). - """ - assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 - - box_type, _ = get_box_type(coordinate) - - bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) - bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) - - return bboxes1.overlaps(bboxes1, bboxes2, mode=mode) - - -@IOU_CALCULATORS.register_module() -class AxisAlignedBboxOverlaps3D(object): - """Axis-aligned 3D Overlaps (IoU) Calculator.""" - - def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): - """Calculate IoU between 2D bboxes. - - Args: - bboxes1 (Tensor): shape (B, m, 6) in - format or empty. - bboxes2 (Tensor): shape (B, n, 6) in - format or empty. - B indicates the batch dim, in shape (B1, B2, ..., Bn). - If ``is_aligned`` is ``True``, then m and n must be equal. - mode (str): "iou" (intersection over union) or "giou" (generalized - intersection over union). - is_aligned (bool, optional): If True, then m and n must be equal. - Defaults to False. - Returns: - Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) - """ - assert bboxes1.size(-1) == bboxes2.size(-1) == 6 - return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode, - is_aligned) - - def __repr__(self): - """str: a string describing the module""" - repr_str = self.__class__.__name__ + '()' - return repr_str - - -def axis_aligned_bbox_overlaps_3d(bboxes1, - bboxes2, - mode='iou', - is_aligned=False, - eps=1e-6): - """Calculate overlap between two set of axis aligned 3D bboxes. If - ``is_aligned`` is ``False``, then calculate the overlaps between each bbox - of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of - bboxes1 and bboxes2. - - Args: - bboxes1 (Tensor): shape (B, m, 6) in - format or empty. - bboxes2 (Tensor): shape (B, n, 6) in - format or empty. - B indicates the batch dim, in shape (B1, B2, ..., Bn). - If ``is_aligned`` is ``True``, then m and n must be equal. - mode (str): "iou" (intersection over union) or "giou" (generalized - intersection over union). - is_aligned (bool, optional): If True, then m and n must be equal. - Defaults to False. - eps (float, optional): A value added to the denominator for numerical - stability. Defaults to 1e-6. 
- - Returns: - Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) - - Example: - >>> bboxes1 = torch.FloatTensor([ - >>> [0, 0, 0, 10, 10, 10], - >>> [10, 10, 10, 20, 20, 20], - >>> [32, 32, 32, 38, 40, 42], - >>> ]) - >>> bboxes2 = torch.FloatTensor([ - >>> [0, 0, 0, 10, 20, 20], - >>> [0, 10, 10, 10, 19, 20], - >>> [10, 10, 10, 20, 20, 20], - >>> ]) - >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2) - >>> assert overlaps.shape == (3, 3) - >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) - >>> assert overlaps.shape == (3, ) - Example: - >>> empty = torch.empty(0, 6) - >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]]) - >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) - >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) - >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) - """ - - assert mode in ['iou', 'giou'], f'Unsupported mode {mode}' - # Either the boxes are empty or the length of boxes's last dimension is 6 - assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0) - assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0) - - # Batch dim must be the same - # Batch dim: (B1, B2, ... Bn) - assert bboxes1.shape[:-2] == bboxes2.shape[:-2] - batch_shape = bboxes1.shape[:-2] - - rows = bboxes1.size(-2) - cols = bboxes2.size(-2) - if is_aligned: - assert rows == cols - - if rows * cols == 0: - if is_aligned: - return bboxes1.new(batch_shape + (rows, )) - else: - return bboxes1.new(batch_shape + (rows, cols)) - - area1 = (bboxes1[..., 3] - - bboxes1[..., 0]) * (bboxes1[..., 4] - bboxes1[..., 1]) * ( - bboxes1[..., 5] - bboxes1[..., 2]) - area2 = (bboxes2[..., 3] - - bboxes2[..., 0]) * (bboxes2[..., 4] - bboxes2[..., 1]) * ( - bboxes2[..., 5] - bboxes2[..., 2]) - - if is_aligned: - lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3] - rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3] - - wh = (rb - lt).clamp(min=0) # [B, rows, 2] - overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] - - if mode in ['iou', 'giou']: - union = area1 + area2 - overlap - else: - union = area1 - if mode == 'giou': - enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3]) - enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:]) - else: - lt = torch.max(bboxes1[..., :, None, :3], - bboxes2[..., None, :, :3]) # [B, rows, cols, 3] - rb = torch.min(bboxes1[..., :, None, 3:], - bboxes2[..., None, :, 3:]) # [B, rows, cols, 3] - - wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3] - overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] - - if mode in ['iou', 'giou']: - union = area1[..., None] + area2[..., None, :] - overlap - if mode == 'giou': - enclosed_lt = torch.min(bboxes1[..., :, None, :3], - bboxes2[..., None, :, :3]) - enclosed_rb = torch.max(bboxes1[..., :, None, 3:], - bboxes2[..., None, :, 3:]) - - eps = union.new_tensor([eps]) - union = torch.max(union, eps) - ious = overlap / union - if mode in ['iou']: - return ious - # calculate gious - enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) - enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2] - enclose_area = torch.max(enclose_area, eps) - gious = ious - (enclose_area - union) / enclose_area - return gious +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch + +from mmdet.core.bbox import bbox_overlaps +from mmdet.core.bbox.iou_calculators.builder import IOU_CALCULATORS +from ..structures import get_box_type + + +@IOU_CALCULATORS.register_module() +class BboxOverlapsNearest3D(object): + """Nearest 3D IoU Calculator. + + Note: + This IoU calculator first finds the nearest 2D boxes in bird eye view + (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. + + Args: + coordinate (str): 'camera', 'lidar', or 'depth' coordinate system. + """ + + def __init__(self, coordinate='lidar'): + assert coordinate in ['camera', 'lidar', 'depth'] + self.coordinate = coordinate + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate nearest 3D IoU. + + Note: + If ``is_aligned`` is ``False``, then it calculates the ious between + each bbox of bboxes1 and bboxes2, otherwise it calculates the ious + between each aligned pair of bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): shape (N, 7+N) + [x, y, z, x_size, y_size, z_size, ry, v]. + bboxes2 (torch.Tensor): shape (M, 7+N) + [x, y, z, x_size, y_size, z_size, ry, v]. + mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + is_aligned (bool): Whether the calculation is aligned. + + Return: + torch.Tensor: If ``is_aligned`` is ``True``, return ious between + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is + ``False``, return shape is M. + """ + return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned, + self.coordinate) + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(coordinate={self.coordinate}' + return repr_str + + +@IOU_CALCULATORS.register_module() +class BboxOverlaps3D(object): + """3D IoU Calculator. + + Args: + coordinate (str): The coordinate system, valid options are + 'camera', 'lidar', and 'depth'. + """ + + def __init__(self, coordinate): + assert coordinate in ['camera', 'lidar', 'depth'] + self.coordinate = coordinate + + def __call__(self, bboxes1, bboxes2, mode='iou'): + """Calculate 3D IoU using cuda implementation. + + Note: + This function calculate the IoU of 3D boxes based on their volumes. + IoU calculator ``:class:BboxOverlaps3D`` uses this function to + calculate the actual 3D IoUs of boxes. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or + iof (intersection over foreground). + + Return: + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 + with shape (M, N) (aligned mode is not supported currently). + """ + return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate) + + def __repr__(self): + """str: return a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(coordinate={self.coordinate}' + return repr_str + + +def bbox_overlaps_nearest_3d(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + coordinate='lidar'): + """Calculate nearest 3D IoU. + + Note: + This function first finds the nearest 2D boxes in bird eye view + (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. + This IoU calculator :class:`BboxOverlapsNearest3D` uses this + function to calculate IoUs of boxes. 
+ + If ``is_aligned`` is ``False``, then it calculates the ious between + each bbox of bboxes1 and bboxes2, otherwise the ious between each + aligned pair of bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + is_aligned (bool): Whether the calculation is aligned + + Return: + torch.Tensor: If ``is_aligned`` is ``True``, return ious between + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is + ``False``, return shape is M. + """ + assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 + + box_type, _ = get_box_type(coordinate) + + bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) + bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) + + # Change the bboxes to bev + # box conversion and iou calculation in torch version on CUDA + # is 10x faster than that in numpy version + bboxes1_bev = bboxes1.nearest_bev + bboxes2_bev = bboxes2.nearest_bev + + ret = bbox_overlaps( + bboxes1_bev, bboxes2_bev, mode=mode, is_aligned=is_aligned) + return ret + + +def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'): + """Calculate 3D IoU using cuda implementation. + + Note: + This function calculates the IoU of 3D boxes based on their volumes. + IoU calculator :class:`BboxOverlaps3D` uses this function to + calculate the actual IoUs of boxes. + + Args: + bboxes1 (torch.Tensor): with shape (N, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + bboxes2 (torch.Tensor): with shape (M, 7+C), + (x, y, z, x_size, y_size, z_size, ry, v*). + mode (str): "iou" (intersection over union) or + iof (intersection over foreground). + coordinate (str): 'camera' or 'lidar' coordinate system. + + Return: + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 + with shape (M, N) (aligned mode is not supported currently). + """ + assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 + + box_type, _ = get_box_type(coordinate) + + bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) + bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) + + return bboxes1.overlaps(bboxes1, bboxes2, mode=mode) + + +@IOU_CALCULATORS.register_module() +class AxisAlignedBboxOverlaps3D(object): + """Axis-aligned 3D Overlaps (IoU) Calculator.""" + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. + + Args: + bboxes1 (Tensor): shape (B, m, 6) in + format or empty. + bboxes2 (Tensor): shape (B, n, 6) in + format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or "giou" (generalized + intersection over union). + is_aligned (bool, optional): If True, then m and n must be equal. + Defaults to False. + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + """ + assert bboxes1.size(-1) == bboxes2.size(-1) == 6 + return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode, + is_aligned) + + def __repr__(self): + """str: a string describing the module""" + repr_str = self.__class__.__name__ + '()' + return repr_str + + +def axis_aligned_bbox_overlaps_3d(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + eps=1e-6): + """Calculate overlap between two set of axis aligned 3D bboxes. 
If + ``is_aligned`` is ``False``, then calculate the overlaps between each bbox + of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (B, m, 6) in + format or empty. + bboxes2 (Tensor): shape (B, n, 6) in + format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or "giou" (generalized + intersection over union). + is_aligned (bool, optional): If True, then m and n must be equal. + Defaults to False. + eps (float, optional): A value added to the denominator for numerical + stability. Defaults to 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 0, 10, 10, 10], + >>> [10, 10, 10, 20, 20, 20], + >>> [32, 32, 32, 38, 40, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 0, 10, 20, 20], + >>> [0, 10, 10, 10, 19, 20], + >>> [10, 10, 10, 20, 20, 20], + >>> ]) + >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + Example: + >>> empty = torch.empty(0, 6) + >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'giou'], f'Unsupported mode {mode}' + # Either the boxes are empty or the length of boxes's last dimension is 6 + assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... 
Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 3] - + bboxes1[..., 0]) * (bboxes1[..., 4] - bboxes1[..., 1]) * ( + bboxes1[..., 5] - bboxes1[..., 2]) + area2 = (bboxes2[..., 3] - + bboxes2[..., 0]) * (bboxes2[..., 4] - bboxes2[..., 1]) * ( + bboxes2[..., 5] - bboxes2[..., 2]) + + if is_aligned: + lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3] + rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3] + + wh = (rb - lt).clamp(min=0) # [B, rows, 2] + overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3]) + enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:]) + else: + lt = torch.max(bboxes1[..., :, None, :3], + bboxes2[..., None, :, :3]) # [B, rows, cols, 3] + rb = torch.min(bboxes1[..., :, None, 3:], + bboxes2[..., None, :, 3:]) # [B, rows, cols, 3] + + wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3] + overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :3], + bboxes2[..., None, :, :3]) + enclosed_rb = torch.max(bboxes1[..., :, None, 3:], + bboxes2[..., None, :, 3:]) + + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + if mode in ['iou']: + return ious + # calculate gious + enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2] + enclose_area = torch.max(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git a/mmdet3d/core/bbox/samplers/__init__.py b/mmdet3d/core/bbox/samplers/__init__.py index 168780b..f1b408d 100644 --- a/mmdet3d/core/bbox/samplers/__init__.py +++ b/mmdet3d/core/bbox/samplers/__init__.py @@ -1,13 +1,13 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.core.bbox.samplers import (BaseSampler, CombinedSampler, - InstanceBalancedPosSampler, - IoUBalancedNegSampler, OHEMSampler, - PseudoSampler, RandomSampler, - SamplingResult) -from .iou_neg_piecewise_sampler import IoUNegPiecewiseSampler - -__all__ = [ - 'BaseSampler', 'PseudoSampler', 'RandomSampler', - 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', - 'OHEMSampler', 'SamplingResult', 'IoUNegPiecewiseSampler' -] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.core.bbox.samplers import (BaseSampler, CombinedSampler, + InstanceBalancedPosSampler, + IoUBalancedNegSampler, OHEMSampler, + PseudoSampler, RandomSampler, + SamplingResult) +from .iou_neg_piecewise_sampler import IoUNegPiecewiseSampler + +__all__ = [ + 'BaseSampler', 'PseudoSampler', 'RandomSampler', + 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', + 'OHEMSampler', 'SamplingResult', 'IoUNegPiecewiseSampler' +] diff --git a/mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py b/mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py index cbd8483..b5eb5a2 100644 --- a/mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py +++ b/mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py @@ -1,183 +1,183 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet.core.bbox.builder import BBOX_SAMPLERS -from . import RandomSampler, SamplingResult - - -@BBOX_SAMPLERS.register_module() -class IoUNegPiecewiseSampler(RandomSampler): - """IoU Piece-wise Sampling. - - Sampling negative proposals according to a list of IoU thresholds. - The negative proposals are divided into several pieces according - to `neg_iou_piece_thrs`. And the ratio of each piece is indicated - by `neg_piece_fractions`. - - Args: - num (int): Number of proposals. - pos_fraction (float): The fraction of positive proposals. - neg_piece_fractions (list): A list contains fractions that indicates - the ratio of each piece of total negative samplers. - neg_iou_piece_thrs (list): A list contains IoU thresholds that - indicate the upper bound of this piece. - neg_pos_ub (float): The total ratio to limit the upper bound - number of negative samples. - add_gt_as_proposals (bool): Whether to add gt as proposals. - """ - - def __init__(self, - num, - pos_fraction=None, - neg_piece_fractions=None, - neg_iou_piece_thrs=None, - neg_pos_ub=-1, - add_gt_as_proposals=False, - return_iou=False): - super(IoUNegPiecewiseSampler, - self).__init__(num, pos_fraction, neg_pos_ub, - add_gt_as_proposals) - assert isinstance(neg_piece_fractions, list) - assert len(neg_piece_fractions) == len(neg_iou_piece_thrs) - self.neg_piece_fractions = neg_piece_fractions - self.neg_iou_thr = neg_iou_piece_thrs - self.return_iou = return_iou - self.neg_piece_num = len(self.neg_piece_fractions) - - def _sample_pos(self, assign_result, num_expected, **kwargs): - """Randomly sample some positive samples.""" - pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) - if pos_inds.numel() != 0: - pos_inds = pos_inds.squeeze(1) - if pos_inds.numel() <= num_expected: - return pos_inds - else: - return self.random_choice(pos_inds, num_expected) - - def _sample_neg(self, assign_result, num_expected, **kwargs): - """Randomly sample some negative samples.""" - neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) - if neg_inds.numel() != 0: - neg_inds = neg_inds.squeeze(1) - if len(neg_inds) <= 0: - return neg_inds.squeeze(1) - else: - neg_inds_choice = neg_inds.new_zeros([0]) - extend_num = 0 - max_overlaps = assign_result.max_overlaps[neg_inds] - - for piece_inds in range(self.neg_piece_num): - if piece_inds == self.neg_piece_num - 1: # for the last piece - piece_expected_num = num_expected - len(neg_inds_choice) - min_iou_thr = 0 - else: - # if the numbers of negative samplers in previous - # pieces are less than the expected number, extend - # the same number in the current piece. 
- piece_expected_num = int( - num_expected * - self.neg_piece_fractions[piece_inds]) + extend_num - min_iou_thr = self.neg_iou_thr[piece_inds + 1] - max_iou_thr = self.neg_iou_thr[piece_inds] - piece_neg_inds = torch.nonzero( - (max_overlaps >= min_iou_thr) - & (max_overlaps < max_iou_thr), - as_tuple=False).view(-1) - - if len(piece_neg_inds) < piece_expected_num: - neg_inds_choice = torch.cat( - [neg_inds_choice, neg_inds[piece_neg_inds]], dim=0) - extend_num += piece_expected_num - len(piece_neg_inds) - - # for the last piece - if piece_inds == self.neg_piece_num - 1: - extend_neg_num = num_expected - len(neg_inds_choice) - # if the numbers of nagetive samples > 0, we will - # randomly select num_expected samples in last piece - if piece_neg_inds.numel() > 0: - rand_idx = torch.randint( - low=0, - high=piece_neg_inds.numel(), - size=(extend_neg_num, )).long() - neg_inds_choice = torch.cat( - [neg_inds_choice, piece_neg_inds[rand_idx]], - dim=0) - # if the numbers of nagetive samples == 0, we will - # randomly select num_expected samples in all - # previous pieces - else: - rand_idx = torch.randint( - low=0, - high=neg_inds_choice.numel(), - size=(extend_neg_num, )).long() - neg_inds_choice = torch.cat( - [neg_inds_choice, neg_inds_choice[rand_idx]], - dim=0) - else: - piece_choice = self.random_choice(piece_neg_inds, - piece_expected_num) - neg_inds_choice = torch.cat( - [neg_inds_choice, neg_inds[piece_choice]], dim=0) - extend_num = 0 - assert len(neg_inds_choice) == num_expected - return neg_inds_choice - - def sample(self, - assign_result, - bboxes, - gt_bboxes, - gt_labels=None, - **kwargs): - """Sample positive and negative bboxes. - - This is a simple implementation of bbox sampling given candidates, - assigning results and ground truth bboxes. - - Args: - assign_result (:obj:`AssignResult`): Bbox assigning results. - bboxes (torch.Tensor): Boxes to be sampled from. - gt_bboxes (torch.Tensor): Ground truth bboxes. - gt_labels (torch.Tensor, optional): Class labels of ground truth - bboxes. - - Returns: - :obj:`SamplingResult`: Sampling result. - """ - if len(bboxes.shape) < 2: - bboxes = bboxes[None, :] - - gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.bool) - if self.add_gt_as_proposals and len(gt_bboxes) > 0: - if gt_labels is None: - raise ValueError( - 'gt_labels must be given when add_gt_as_proposals is True') - bboxes = torch.cat([gt_bboxes, bboxes], dim=0) - assign_result.add_gt_(gt_labels) - gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.bool) - gt_flags = torch.cat([gt_ones, gt_flags]) - - num_expected_pos = int(self.num * self.pos_fraction) - pos_inds = self.pos_sampler._sample_pos( - assign_result, num_expected_pos, bboxes=bboxes, **kwargs) - # We found that sampled indices have duplicated items occasionally. - # (may be a bug of PyTorch) - pos_inds = pos_inds.unique() - num_sampled_pos = pos_inds.numel() - num_expected_neg = self.num - num_sampled_pos - if self.neg_pos_ub >= 0: - _pos = max(1, num_sampled_pos) - neg_upper_bound = int(self.neg_pos_ub * _pos) - if num_expected_neg > neg_upper_bound: - num_expected_neg = neg_upper_bound - neg_inds = self.neg_sampler._sample_neg( - assign_result, num_expected_neg, bboxes=bboxes, **kwargs) - - sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, - assign_result, gt_flags) - if self.return_iou: - # PartA2 needs iou score to regression. 
- sampling_result.iou = assign_result.max_overlaps[torch.cat( - [pos_inds, neg_inds])] - sampling_result.iou.detach_() - - return sampling_result +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet.core.bbox.builder import BBOX_SAMPLERS +from . import RandomSampler, SamplingResult + + +@BBOX_SAMPLERS.register_module() +class IoUNegPiecewiseSampler(RandomSampler): + """IoU Piece-wise Sampling. + + Sampling negative proposals according to a list of IoU thresholds. + The negative proposals are divided into several pieces according + to `neg_iou_piece_thrs`. And the ratio of each piece is indicated + by `neg_piece_fractions`. + + Args: + num (int): Number of proposals. + pos_fraction (float): The fraction of positive proposals. + neg_piece_fractions (list): A list contains fractions that indicates + the ratio of each piece of total negative samplers. + neg_iou_piece_thrs (list): A list contains IoU thresholds that + indicate the upper bound of this piece. + neg_pos_ub (float): The total ratio to limit the upper bound + number of negative samples. + add_gt_as_proposals (bool): Whether to add gt as proposals. + """ + + def __init__(self, + num, + pos_fraction=None, + neg_piece_fractions=None, + neg_iou_piece_thrs=None, + neg_pos_ub=-1, + add_gt_as_proposals=False, + return_iou=False): + super(IoUNegPiecewiseSampler, + self).__init__(num, pos_fraction, neg_pos_ub, + add_gt_as_proposals) + assert isinstance(neg_piece_fractions, list) + assert len(neg_piece_fractions) == len(neg_iou_piece_thrs) + self.neg_piece_fractions = neg_piece_fractions + self.neg_iou_thr = neg_iou_piece_thrs + self.return_iou = return_iou + self.neg_piece_num = len(self.neg_piece_fractions) + + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Randomly sample some positive samples.""" + pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + else: + return self.random_choice(pos_inds, num_expected) + + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Randomly sample some negative samples.""" + neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= 0: + return neg_inds.squeeze(1) + else: + neg_inds_choice = neg_inds.new_zeros([0]) + extend_num = 0 + max_overlaps = assign_result.max_overlaps[neg_inds] + + for piece_inds in range(self.neg_piece_num): + if piece_inds == self.neg_piece_num - 1: # for the last piece + piece_expected_num = num_expected - len(neg_inds_choice) + min_iou_thr = 0 + else: + # if the numbers of negative samplers in previous + # pieces are less than the expected number, extend + # the same number in the current piece. 
+ piece_expected_num = int( + num_expected * + self.neg_piece_fractions[piece_inds]) + extend_num + min_iou_thr = self.neg_iou_thr[piece_inds + 1] + max_iou_thr = self.neg_iou_thr[piece_inds] + piece_neg_inds = torch.nonzero( + (max_overlaps >= min_iou_thr) + & (max_overlaps < max_iou_thr), + as_tuple=False).view(-1) + + if len(piece_neg_inds) < piece_expected_num: + neg_inds_choice = torch.cat( + [neg_inds_choice, neg_inds[piece_neg_inds]], dim=0) + extend_num += piece_expected_num - len(piece_neg_inds) + + # for the last piece + if piece_inds == self.neg_piece_num - 1: + extend_neg_num = num_expected - len(neg_inds_choice) + # if the numbers of nagetive samples > 0, we will + # randomly select num_expected samples in last piece + if piece_neg_inds.numel() > 0: + rand_idx = torch.randint( + low=0, + high=piece_neg_inds.numel(), + size=(extend_neg_num, )).long() + neg_inds_choice = torch.cat( + [neg_inds_choice, piece_neg_inds[rand_idx]], + dim=0) + # if the numbers of nagetive samples == 0, we will + # randomly select num_expected samples in all + # previous pieces + else: + rand_idx = torch.randint( + low=0, + high=neg_inds_choice.numel(), + size=(extend_neg_num, )).long() + neg_inds_choice = torch.cat( + [neg_inds_choice, neg_inds_choice[rand_idx]], + dim=0) + else: + piece_choice = self.random_choice(piece_neg_inds, + piece_expected_num) + neg_inds_choice = torch.cat( + [neg_inds_choice, neg_inds[piece_choice]], dim=0) + extend_num = 0 + assert len(neg_inds_choice) == num_expected + return neg_inds_choice + + def sample(self, + assign_result, + bboxes, + gt_bboxes, + gt_labels=None, + **kwargs): + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + bboxes (torch.Tensor): Boxes to be sampled from. + gt_bboxes (torch.Tensor): Ground truth bboxes. + gt_labels (torch.Tensor, optional): Class labels of ground truth + bboxes. + + Returns: + :obj:`SamplingResult`: Sampling result. + """ + if len(bboxes.shape) < 2: + bboxes = bboxes[None, :] + + gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.bool) + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + if gt_labels is None: + raise ValueError( + 'gt_labels must be given when add_gt_as_proposals is True') + bboxes = torch.cat([gt_bboxes, bboxes], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.bool) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + # We found that sampled indices have duplicated items occasionally. + # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg( + assign_result, num_expected_neg, bboxes=bboxes, **kwargs) + + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + if self.return_iou: + # PartA2 needs iou score to regression. 
+ sampling_result.iou = assign_result.max_overlaps[torch.cat( + [pos_inds, neg_inds])] + sampling_result.iou.detach_() + + return sampling_result diff --git a/mmdet3d/core/bbox/structures/__init__.py b/mmdet3d/core/bbox/structures/__init__.py index 460035a..4549071 100644 --- a/mmdet3d/core/bbox/structures/__init__.py +++ b/mmdet3d/core/bbox/structures/__init__.py @@ -1,18 +1,18 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base_box3d import BaseInstance3DBoxes -from .box_3d_mode import Box3DMode -from .cam_box3d import CameraInstance3DBoxes -from .coord_3d_mode import Coord3DMode -from .depth_box3d import DepthInstance3DBoxes -from .lidar_box3d import LiDARInstance3DBoxes -from .utils import (get_box_type, get_proj_mat_by_coord_type, limit_period, - mono_cam_box2vis, points_cam2img, points_img2cam, - rotation_3d_in_axis, xywhr2xyxyr) - -__all__ = [ - 'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes', - 'CameraInstance3DBoxes', 'DepthInstance3DBoxes', 'xywhr2xyxyr', - 'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img', - 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis', - 'get_proj_mat_by_coord_type' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .base_box3d import BaseInstance3DBoxes +from .box_3d_mode import Box3DMode +from .cam_box3d import CameraInstance3DBoxes +from .coord_3d_mode import Coord3DMode +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes +from .utils import (get_box_type, get_proj_mat_by_coord_type, limit_period, + mono_cam_box2vis, points_cam2img, points_img2cam, + rotation_3d_in_axis, xywhr2xyxyr) + +__all__ = [ + 'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes', + 'CameraInstance3DBoxes', 'DepthInstance3DBoxes', 'xywhr2xyxyr', + 'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img', + 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis', + 'get_proj_mat_by_coord_type' +] diff --git a/mmdet3d/core/bbox/structures/base_box3d.py b/mmdet3d/core/bbox/structures/base_box3d.py index 3c74f67..7e8b016 100644 --- a/mmdet3d/core/bbox/structures/base_box3d.py +++ b/mmdet3d/core/bbox/structures/base_box3d.py @@ -1,578 +1,578 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings -from abc import abstractmethod - -import numpy as np -import torch -from mmcv.ops import box_iou_rotated, points_in_boxes_all, points_in_boxes_part - -from .utils import limit_period - - -class BaseInstance3DBoxes(object): - """Base class for 3D Boxes. - - Note: - The box is bottom centered, i.e. the relative position of origin in - the box is (0.5, 0.5, 0). - - Args: - tensor (torch.Tensor | np.ndarray | list): a N x box_dim matrix. - box_dim (int): Number of the dimension of a box. - Each row is (x, y, z, x_size, y_size, z_size, yaw). - Defaults to 7. - with_yaw (bool): Whether the box is with yaw rotation. - If False, the value of yaw will be set to 0 as minmax boxes. - Defaults to True. - origin (tuple[float], optional): Relative position of the box origin. - Defaults to (0.5, 0.5, 0). This will guide the box be converted to - (0.5, 0.5, 0) mode. - - Attributes: - tensor (torch.Tensor): Float matrix of N x box_dim. - box_dim (int): Integer indicating the dimension of a box. - Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). - with_yaw (bool): If True, the value of yaw will be set to 0 as minmax - boxes. 
- """ - - def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): - if isinstance(tensor, torch.Tensor): - device = tensor.device - else: - device = torch.device('cpu') - tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) - if tensor.numel() == 0: - # Use reshape, so we don't end up creating a new tensor that - # does not depend on the inputs (and consequently confuses jit) - tensor = tensor.reshape((0, box_dim)).to( - dtype=torch.float32, device=device) - assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() - - if tensor.shape[-1] == 6: - # If the dimension of boxes is 6, we expand box_dim by padding - # 0 as a fake yaw and set with_yaw to False. - assert box_dim == 6 - fake_rot = tensor.new_zeros(tensor.shape[0], 1) - tensor = torch.cat((tensor, fake_rot), dim=-1) - self.box_dim = box_dim + 1 - self.with_yaw = False - else: - self.box_dim = box_dim - self.with_yaw = with_yaw - self.tensor = tensor.clone() - - if origin != (0.5, 0.5, 0): - dst = self.tensor.new_tensor((0.5, 0.5, 0)) - src = self.tensor.new_tensor(origin) - self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) - - @property - def volume(self): - """torch.Tensor: A vector with volume of each box.""" - return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] - - @property - def dims(self): - """torch.Tensor: Size dimensions of each box in shape (N, 3).""" - return self.tensor[:, 3:6] - - @property - def yaw(self): - """torch.Tensor: A vector with yaw of each box in shape (N, ).""" - return self.tensor[:, 6] - - @property - def height(self): - """torch.Tensor: A vector with height of each box in shape (N, ).""" - return self.tensor[:, 5] - - @property - def top_height(self): - """torch.Tensor: - A vector with the top height of each box in shape (N, ).""" - return self.bottom_height + self.height - - @property - def bottom_height(self): - """torch.Tensor: - A vector with bottom's height of each box in shape (N, ).""" - return self.tensor[:, 2] - - @property - def center(self): - """Calculate the center of all the boxes. - - Note: - In MMDetection3D's convention, the bottom center is - usually taken as the default center. - - The relative position of the centers in different kinds of - boxes are different, e.g., the relative center of a boxes is - (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. - It is recommended to use ``bottom_center`` or ``gravity_center`` - for clearer usage. - - Returns: - torch.Tensor: A tensor with center of each box in shape (N, 3). 
- """ - return self.bottom_center - - @property - def bottom_center(self): - """torch.Tensor: A tensor with center of each box in shape (N, 3).""" - return self.tensor[:, :3] - - @property - def gravity_center(self): - """torch.Tensor: A tensor with center of each box in shape (N, 3).""" - pass - - @property - def corners(self): - """torch.Tensor: - a tensor with 8 corners of each box in shape (N, 8, 3).""" - pass - - @property - def bev(self): - """torch.Tensor: 2D BEV box of each box with rotation - in XYWHR format, in shape (N, 5).""" - return self.tensor[:, [0, 1, 3, 4, 6]] - - @property - def nearest_bev(self): - """torch.Tensor: A tensor of 2D BEV box of each box - without rotation.""" - # Obtain BEV boxes with rotation in XYWHR format - bev_rotated_boxes = self.bev - # convert the rotation to a valid range - rotations = bev_rotated_boxes[:, -1] - normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) - - # find the center of boxes - conditions = (normed_rotations > np.pi / 4)[..., None] - bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, - [0, 1, 3, 2]], - bev_rotated_boxes[:, :4]) - - centers = bboxes_xywh[:, :2] - dims = bboxes_xywh[:, 2:] - bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) - return bev_boxes - - def in_range_bev(self, box_range): - """Check whether the boxes are in the given range. - - Args: - box_range (list | torch.Tensor): the range of box - (x_min, y_min, x_max, y_max) - - Note: - The original implementation of SECOND checks whether boxes in - a range by checking whether the points are in a convex - polygon, we reduce the burden for simpler cases. - - Returns: - torch.Tensor: Whether each box is inside the reference range. - """ - in_range_flags = ((self.bev[:, 0] > box_range[0]) - & (self.bev[:, 1] > box_range[1]) - & (self.bev[:, 0] < box_range[2]) - & (self.bev[:, 1] < box_range[3])) - return in_range_flags - - @abstractmethod - def rotate(self, angle, points=None): - """Rotate boxes with points (optional) with the given angle or rotation - matrix. - - Args: - angle (float | torch.Tensor | np.ndarray): - Rotation angle or rotation matrix. - points (torch.Tensor | numpy.ndarray | - :obj:`BasePoints`, optional): - Points to rotate. Defaults to None. - """ - pass - - @abstractmethod - def flip(self, bev_direction='horizontal'): - """Flip the boxes in BEV along given BEV direction. - - Args: - bev_direction (str, optional): Direction by which to flip. - Can be chosen from 'horizontal' and 'vertical'. - Defaults to 'horizontal'. - """ - pass - - def translate(self, trans_vector): - """Translate boxes with the given translation vector. - - Args: - trans_vector (torch.Tensor): Translation vector of size (1, 3). - """ - if not isinstance(trans_vector, torch.Tensor): - trans_vector = self.tensor.new_tensor(trans_vector) - self.tensor[:, :3] += trans_vector - - def in_range_3d(self, box_range): - """Check whether the boxes are in the given range. - - Args: - box_range (list | torch.Tensor): The range of box - (x_min, y_min, z_min, x_max, y_max, z_max) - - Note: - In the original implementation of SECOND, checking whether - a box in the range checks whether the points are in a convex - polygon, we try to reduce the burden for simpler cases. - - Returns: - torch.Tensor: A binary vector indicating whether each box is - inside the reference range. 
- """ - in_range_flags = ((self.tensor[:, 0] > box_range[0]) - & (self.tensor[:, 1] > box_range[1]) - & (self.tensor[:, 2] > box_range[2]) - & (self.tensor[:, 0] < box_range[3]) - & (self.tensor[:, 1] < box_range[4]) - & (self.tensor[:, 2] < box_range[5])) - return in_range_flags - - @abstractmethod - def convert_to(self, dst, rt_mat=None): - """Convert self to ``dst`` mode. - - Args: - dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - :obj:`BaseInstance3DBoxes`: The converted box of the same type - in the `dst` mode. - """ - pass - - def scale(self, scale_factor): - """Scale the box with horizontal and vertical scaling factors. - - Args: - scale_factors (float): Scale factors to scale the boxes. - """ - self.tensor[:, :6] *= scale_factor - self.tensor[:, 7:] *= scale_factor # velocity - - def limit_yaw(self, offset=0.5, period=np.pi): - """Limit the yaw to a given period and offset. - - Args: - offset (float, optional): The offset of the yaw. Defaults to 0.5. - period (float, optional): The expected period. Defaults to np.pi. - """ - self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period) - - def nonempty(self, threshold=0.0): - """Find boxes that are non-empty. - - A box is considered empty, - if either of its side is no larger than threshold. - - Args: - threshold (float, optional): The threshold of minimal sizes. - Defaults to 0.0. - - Returns: - torch.Tensor: A binary vector which represents whether each - box is empty (False) or non-empty (True). - """ - box = self.tensor - size_x = box[..., 3] - size_y = box[..., 4] - size_z = box[..., 5] - keep = ((size_x > threshold) - & (size_y > threshold) & (size_z > threshold)) - return keep - - def __getitem__(self, item): - """ - Note: - The following usage are allowed: - 1. `new_boxes = boxes[3]`: - return a `Boxes` that contains only one box. - 2. `new_boxes = boxes[2:10]`: - return a slice of boxes. - 3. `new_boxes = boxes[vector]`: - where vector is a torch.BoolTensor with `length = len(boxes)`. - Nonzero elements in the vector will be selected. - Note that the returned Boxes might share storage with this Boxes, - subject to Pytorch's indexing semantics. - - Returns: - :obj:`BaseInstance3DBoxes`: A new object of - :class:`BaseInstance3DBoxes` after indexing. - """ - original_type = type(self) - if isinstance(item, int): - return original_type( - self.tensor[item].view(1, -1), - box_dim=self.box_dim, - with_yaw=self.with_yaw) - b = self.tensor[item] - assert b.dim() == 2, \ - f'Indexing on Boxes with {item} failed to return a matrix!' - return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw) - - def __len__(self): - """int: Number of boxes in the current object.""" - return self.tensor.shape[0] - - def __repr__(self): - """str: Return a strings that describes the object.""" - return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' - - @classmethod - def cat(cls, boxes_list): - """Concatenate a list of Boxes into a single Boxes. - - Args: - boxes_list (list[:obj:`BaseInstance3DBoxes`]): List of boxes. - - Returns: - :obj:`BaseInstance3DBoxes`: The concatenated Boxes. 
- """ - assert isinstance(boxes_list, (list, tuple)) - if len(boxes_list) == 0: - return cls(torch.empty(0)) - assert all(isinstance(box, cls) for box in boxes_list) - - # use torch.cat (v.s. layers.cat) - # so the returned boxes never share storage with input - cat_boxes = cls( - torch.cat([b.tensor for b in boxes_list], dim=0), - box_dim=boxes_list[0].tensor.shape[1], - with_yaw=boxes_list[0].with_yaw) - return cat_boxes - - def to(self, device): - """Convert current boxes to a specific device. - - Args: - device (str | :obj:`torch.device`): The name of the device. - - Returns: - :obj:`BaseInstance3DBoxes`: A new boxes object on the - specific device. - """ - original_type = type(self) - return original_type( - self.tensor.to(device), - box_dim=self.box_dim, - with_yaw=self.with_yaw) - - def clone(self): - """Clone the Boxes. - - Returns: - :obj:`BaseInstance3DBoxes`: Box object with the same properties - as self. - """ - original_type = type(self) - return original_type( - self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw) - - @property - def device(self): - """str: The device of the boxes are on.""" - return self.tensor.device - - def __iter__(self): - """Yield a box as a Tensor of shape (4,) at a time. - - Returns: - torch.Tensor: A box of shape (4,). - """ - yield from self.tensor - - @classmethod - def height_overlaps(cls, boxes1, boxes2, mode='iou'): - """Calculate height overlaps of two boxes. - - Note: - This function calculates the height overlaps between boxes1 and - boxes2, boxes1 and boxes2 should be in the same type. - - Args: - boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. - boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. - mode (str, optional): Mode of IoU calculation. Defaults to 'iou'. - - Returns: - torch.Tensor: Calculated iou of boxes. - """ - assert isinstance(boxes1, BaseInstance3DBoxes) - assert isinstance(boxes2, BaseInstance3DBoxes) - assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ - f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' - - boxes1_top_height = boxes1.top_height.view(-1, 1) - boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) - boxes2_top_height = boxes2.top_height.view(1, -1) - boxes2_bottom_height = boxes2.bottom_height.view(1, -1) - - heighest_of_bottom = torch.max(boxes1_bottom_height, - boxes2_bottom_height) - lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height) - overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0) - return overlaps_h - - @classmethod - def overlaps(cls, boxes1, boxes2, mode='iou'): - """Calculate 3D overlaps of two boxes. - - Note: - This function calculates the overlaps between ``boxes1`` and - ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. - - Args: - boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. - boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. - mode (str, optional): Mode of iou calculation. Defaults to 'iou'. - - Returns: - torch.Tensor: Calculated 3D overlaps of the boxes. - """ - assert isinstance(boxes1, BaseInstance3DBoxes) - assert isinstance(boxes2, BaseInstance3DBoxes) - assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ - f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' 
- - assert mode in ['iou', 'iof'] - - rows = len(boxes1) - cols = len(boxes2) - if rows * cols == 0: - return boxes1.tensor.new(rows, cols) - - # height overlap - overlaps_h = cls.height_overlaps(boxes1, boxes2) - - # bev overlap - iou2d = box_iou_rotated(boxes1.bev, boxes2.bev) - areas1 = (boxes1.bev[:, 2] * boxes1.bev[:, 3]).unsqueeze(1).expand( - rows, cols) - areas2 = (boxes2.bev[:, 2] * boxes2.bev[:, 3]).unsqueeze(0).expand( - rows, cols) - overlaps_bev = iou2d * (areas1 + areas2) / (1 + iou2d) - - # 3d overlaps - overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h - - volume1 = boxes1.volume.view(-1, 1) - volume2 = boxes2.volume.view(1, -1) - - if mode == 'iou': - # the clamp func is used to avoid division of 0 - iou3d = overlaps_3d / torch.clamp( - volume1 + volume2 - overlaps_3d, min=1e-8) - else: - iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) - - return iou3d - - def new_box(self, data): - """Create a new box object with data. - - The new box and its tensor has the similar properties - as self and self.tensor, respectively. - - Args: - data (torch.Tensor | numpy.array | list): Data to be copied. - - Returns: - :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, - the object's other properties are similar to ``self``. - """ - new_tensor = self.tensor.new_tensor(data) \ - if not isinstance(data, torch.Tensor) else data.to(self.device) - original_type = type(self) - return original_type( - new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw) - - def points_in_boxes_part(self, points, boxes_override=None): - """Find the box in which each point is. - - Args: - points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), - 3 dimensions are (x, y, z) in LiDAR or depth coordinate. - boxes_override (torch.Tensor, optional): Boxes to override - `self.tensor`. Defaults to None. - - Returns: - torch.Tensor: The index of the first box that each point - is in, in shape (M, ). Default value is -1 - (if the point is not enclosed by any box). - - Note: - If a point is enclosed by multiple boxes, the index of the - first box will be returned. - """ - if boxes_override is not None: - boxes = boxes_override - else: - boxes = self.tensor - if points.dim() == 2: - points = points.unsqueeze(0) - box_idx = points_in_boxes_part(points, - boxes.unsqueeze(0).to( - points.device)).squeeze(0) - return box_idx - - def points_in_boxes_all(self, points, boxes_override=None): - """Find all boxes in which each point is. - - Args: - points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), - 3 dimensions are (x, y, z) in LiDAR or depth coordinate. - boxes_override (torch.Tensor, optional): Boxes to override - `self.tensor`. Defaults to None. - - Returns: - torch.Tensor: A tensor indicating whether a point is in a box, - in shape (M, T). T is the number of boxes. Denote this - tensor as A, if the m^th point is in the t^th box, then - `A[m, t] == 1`, elsewise `A[m, t] == 0`. 
- """ - if boxes_override is not None: - boxes = boxes_override - else: - boxes = self.tensor - - points_clone = points.clone()[..., :3] - if points_clone.dim() == 2: - points_clone = points_clone.unsqueeze(0) - else: - assert points_clone.dim() == 3 and points_clone.shape[0] == 1 - - boxes = boxes.to(points_clone.device).unsqueeze(0) - box_idxs_of_pts = points_in_boxes_all(points_clone, boxes) - - return box_idxs_of_pts.squeeze(0) - - def points_in_boxes(self, points, boxes_override=None): - warnings.warn('DeprecationWarning: points_in_boxes is a ' - 'deprecated method, please consider using ' - 'points_in_boxes_part.') - return self.points_in_boxes_part(points, boxes_override) - - def points_in_boxes_batch(self, points, boxes_override=None): - warnings.warn('DeprecationWarning: points_in_boxes_batch is a ' - 'deprecated method, please consider using ' - 'points_in_boxes_all.') - return self.points_in_boxes_all(points, boxes_override) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from abc import abstractmethod + +import numpy as np +import torch +from mmcv.ops import box_iou_rotated, points_in_boxes_all, points_in_boxes_part + +from .utils import limit_period + + +class BaseInstance3DBoxes(object): + """Base class for 3D Boxes. + + Note: + The box is bottom centered, i.e. the relative position of origin in + the box is (0.5, 0.5, 0). + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x box_dim matrix. + box_dim (int): Number of the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw). + Defaults to 7. + with_yaw (bool): Whether the box is with yaw rotation. + If False, the value of yaw will be set to 0 as minmax boxes. + Defaults to True. + origin (tuple[float], optional): Relative position of the box origin. + Defaults to (0.5, 0.5, 0). This will guide the box be converted to + (0.5, 0.5, 0) mode. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to( + dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. 
+ assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 0.5, 0): + dst = self.tensor.new_tensor((0.5, 0.5, 0)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def volume(self): + """torch.Tensor: A vector with volume of each box.""" + return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] + + @property + def dims(self): + """torch.Tensor: Size dimensions of each box in shape (N, 3).""" + return self.tensor[:, 3:6] + + @property + def yaw(self): + """torch.Tensor: A vector with yaw of each box in shape (N, ).""" + return self.tensor[:, 6] + + @property + def height(self): + """torch.Tensor: A vector with height of each box in shape (N, ).""" + return self.tensor[:, 5] + + @property + def top_height(self): + """torch.Tensor: + A vector with the top height of each box in shape (N, ).""" + return self.bottom_height + self.height + + @property + def bottom_height(self): + """torch.Tensor: + A vector with bottom's height of each box in shape (N, ).""" + return self.tensor[:, 2] + + @property + def center(self): + """Calculate the center of all the boxes. + + Note: + In MMDetection3D's convention, the bottom center is + usually taken as the default center. + + The relative position of the centers in different kinds of + boxes are different, e.g., the relative center of a boxes is + (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. + It is recommended to use ``bottom_center`` or ``gravity_center`` + for clearer usage. + + Returns: + torch.Tensor: A tensor with center of each box in shape (N, 3). + """ + return self.bottom_center + + @property + def bottom_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + return self.tensor[:, :3] + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + pass + + @property + def corners(self): + """torch.Tensor: + a tensor with 8 corners of each box in shape (N, 8, 3).""" + pass + + @property + def bev(self): + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format, in shape (N, 5).""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + @property + def nearest_bev(self): + """torch.Tensor: A tensor of 2D BEV box of each box + without rotation.""" + # Obtain BEV boxes with rotation in XYWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def in_range_bev(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): the range of box + (x_min, y_min, x_max, y_max) + + Note: + The original implementation of SECOND checks whether boxes in + a range by checking whether the points are in a convex + polygon, we reduce the burden for simpler cases. + + Returns: + torch.Tensor: Whether each box is inside the reference range. 
+ """ + in_range_flags = ((self.bev[:, 0] > box_range[0]) + & (self.bev[:, 1] > box_range[1]) + & (self.bev[:, 0] < box_range[2]) + & (self.bev[:, 1] < box_range[3])) + return in_range_flags + + @abstractmethod + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor | numpy.ndarray | + :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + """ + pass + + @abstractmethod + def flip(self, bev_direction='horizontal'): + """Flip the boxes in BEV along given BEV direction. + + Args: + bev_direction (str, optional): Direction by which to flip. + Can be chosen from 'horizontal' and 'vertical'. + Defaults to 'horizontal'. + """ + pass + + def translate(self, trans_vector): + """Translate boxes with the given translation vector. + + Args: + trans_vector (torch.Tensor): Translation vector of size (1, 3). + """ + if not isinstance(trans_vector, torch.Tensor): + trans_vector = self.tensor.new_tensor(trans_vector) + self.tensor[:, :3] += trans_vector + + def in_range_3d(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): The range of box + (x_min, y_min, z_min, x_max, y_max, z_max) + + Note: + In the original implementation of SECOND, checking whether + a box in the range checks whether the points are in a convex + polygon, we try to reduce the burden for simpler cases. + + Returns: + torch.Tensor: A binary vector indicating whether each box is + inside the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > box_range[0]) + & (self.tensor[:, 1] > box_range[1]) + & (self.tensor[:, 2] > box_range[2]) + & (self.tensor[:, 0] < box_range[3]) + & (self.tensor[:, 1] < box_range[4]) + & (self.tensor[:, 2] < box_range[5])) + return in_range_flags + + @abstractmethod + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type + in the `dst` mode. + """ + pass + + def scale(self, scale_factor): + """Scale the box with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the boxes. + """ + self.tensor[:, :6] *= scale_factor + self.tensor[:, 7:] *= scale_factor # velocity + + def limit_yaw(self, offset=0.5, period=np.pi): + """Limit the yaw to a given period and offset. + + Args: + offset (float, optional): The offset of the yaw. Defaults to 0.5. + period (float, optional): The expected period. Defaults to np.pi. + """ + self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period) + + def nonempty(self, threshold=0.0): + """Find boxes that are non-empty. + + A box is considered empty, + if either of its side is no larger than threshold. + + Args: + threshold (float, optional): The threshold of minimal sizes. + Defaults to 0.0. + + Returns: + torch.Tensor: A binary vector which represents whether each + box is empty (False) or non-empty (True). 
+ """ + box = self.tensor + size_x = box[..., 3] + size_y = box[..., 4] + size_z = box[..., 5] + keep = ((size_x > threshold) + & (size_y > threshold) & (size_z > threshold)) + return keep + + def __getitem__(self, item): + """ + Note: + The following usage are allowed: + 1. `new_boxes = boxes[3]`: + return a `Boxes` that contains only one box. + 2. `new_boxes = boxes[2:10]`: + return a slice of boxes. + 3. `new_boxes = boxes[vector]`: + where vector is a torch.BoolTensor with `length = len(boxes)`. + Nonzero elements in the vector will be selected. + Note that the returned Boxes might share storage with this Boxes, + subject to Pytorch's indexing semantics. + + Returns: + :obj:`BaseInstance3DBoxes`: A new object of + :class:`BaseInstance3DBoxes` after indexing. + """ + original_type = type(self) + if isinstance(item, int): + return original_type( + self.tensor[item].view(1, -1), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + b = self.tensor[item] + assert b.dim() == 2, \ + f'Indexing on Boxes with {item} failed to return a matrix!' + return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw) + + def __len__(self): + """int: Number of boxes in the current object.""" + return self.tensor.shape[0] + + def __repr__(self): + """str: Return a strings that describes the object.""" + return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' + + @classmethod + def cat(cls, boxes_list): + """Concatenate a list of Boxes into a single Boxes. + + Args: + boxes_list (list[:obj:`BaseInstance3DBoxes`]): List of boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: The concatenated Boxes. + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(box, cls) for box in boxes_list) + + # use torch.cat (v.s. layers.cat) + # so the returned boxes never share storage with input + cat_boxes = cls( + torch.cat([b.tensor for b in boxes_list], dim=0), + box_dim=boxes_list[0].tensor.shape[1], + with_yaw=boxes_list[0].with_yaw) + return cat_boxes + + def to(self, device): + """Convert current boxes to a specific device. + + Args: + device (str | :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the + specific device. + """ + original_type = type(self) + return original_type( + self.tensor.to(device), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def clone(self): + """Clone the Boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: Box object with the same properties + as self. + """ + original_type = type(self) + return original_type( + self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw) + + @property + def device(self): + """str: The device of the boxes are on.""" + return self.tensor.device + + def __iter__(self): + """Yield a box as a Tensor of shape (4,) at a time. + + Returns: + torch.Tensor: A box of shape (4,). + """ + yield from self.tensor + + @classmethod + def height_overlaps(cls, boxes1, boxes2, mode='iou'): + """Calculate height overlaps of two boxes. + + Note: + This function calculates the height overlaps between boxes1 and + boxes2, boxes1 and boxes2 should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of IoU calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes. 
+ """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ + f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + heighest_of_bottom = torch.max(boxes1_bottom_height, + boxes2_bottom_height) + lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0) + return overlaps_h + + @classmethod + def overlaps(cls, boxes1, boxes2, mode='iou'): + """Calculate 3D overlaps of two boxes. + + Note: + This function calculates the overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated 3D overlaps of the boxes. + """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ + f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' + + assert mode in ['iou', 'iof'] + + rows = len(boxes1) + cols = len(boxes2) + if rows * cols == 0: + return boxes1.tensor.new(rows, cols) + + # height overlap + overlaps_h = cls.height_overlaps(boxes1, boxes2) + + # bev overlap + iou2d = box_iou_rotated(boxes1.bev, boxes2.bev) + areas1 = (boxes1.bev[:, 2] * boxes1.bev[:, 3]).unsqueeze(1).expand( + rows, cols) + areas2 = (boxes2.bev[:, 2] * boxes2.bev[:, 3]).unsqueeze(0).expand( + rows, cols) + overlaps_bev = iou2d * (areas1 + areas2) / (1 + iou2d) + + # 3d overlaps + overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h + + volume1 = boxes1.volume.view(-1, 1) + volume2 = boxes2.volume.view(1, -1) + + if mode == 'iou': + # the clamp func is used to avoid division of 0 + iou3d = overlaps_3d / torch.clamp( + volume1 + volume2 - overlaps_3d, min=1e-8) + else: + iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) + + return iou3d + + def new_box(self, data): + """Create a new box object with data. + + The new box and its tensor has the similar properties + as self and self.tensor, respectively. + + Args: + data (torch.Tensor | numpy.array | list): Data to be copied. + + Returns: + :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, + the object's other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) \ + if not isinstance(data, torch.Tensor) else data.to(self.device) + original_type = type(self) + return original_type( + new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw) + + def points_in_boxes_part(self, points, boxes_override=None): + """Find the box in which each point is. + + Args: + points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), + 3 dimensions are (x, y, z) in LiDAR or depth coordinate. + boxes_override (torch.Tensor, optional): Boxes to override + `self.tensor`. Defaults to None. + + Returns: + torch.Tensor: The index of the first box that each point + is in, in shape (M, ). Default value is -1 + (if the point is not enclosed by any box). 
+ + Note: + If a point is enclosed by multiple boxes, the index of the + first box will be returned. + """ + if boxes_override is not None: + boxes = boxes_override + else: + boxes = self.tensor + if points.dim() == 2: + points = points.unsqueeze(0) + box_idx = points_in_boxes_part(points, + boxes.unsqueeze(0).to( + points.device)).squeeze(0) + return box_idx + + def points_in_boxes_all(self, points, boxes_override=None): + """Find all boxes in which each point is. + + Args: + points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), + 3 dimensions are (x, y, z) in LiDAR or depth coordinate. + boxes_override (torch.Tensor, optional): Boxes to override + `self.tensor`. Defaults to None. + + Returns: + torch.Tensor: A tensor indicating whether a point is in a box, + in shape (M, T). T is the number of boxes. Denote this + tensor as A, if the m^th point is in the t^th box, then + `A[m, t] == 1`, elsewise `A[m, t] == 0`. + """ + if boxes_override is not None: + boxes = boxes_override + else: + boxes = self.tensor + + points_clone = points.clone()[..., :3] + if points_clone.dim() == 2: + points_clone = points_clone.unsqueeze(0) + else: + assert points_clone.dim() == 3 and points_clone.shape[0] == 1 + + boxes = boxes.to(points_clone.device).unsqueeze(0) + box_idxs_of_pts = points_in_boxes_all(points_clone, boxes) + + return box_idxs_of_pts.squeeze(0) + + def points_in_boxes(self, points, boxes_override=None): + warnings.warn('DeprecationWarning: points_in_boxes is a ' + 'deprecated method, please consider using ' + 'points_in_boxes_part.') + return self.points_in_boxes_part(points, boxes_override) + + def points_in_boxes_batch(self, points, boxes_override=None): + warnings.warn('DeprecationWarning: points_in_boxes_batch is a ' + 'deprecated method, please consider using ' + 'points_in_boxes_all.') + return self.points_in_boxes_all(points, boxes_override) diff --git a/mmdet3d/core/bbox/structures/box_3d_mode.py b/mmdet3d/core/bbox/structures/box_3d_mode.py index 3048b0a..2ad0945 100644 --- a/mmdet3d/core/bbox/structures/box_3d_mode.py +++ b/mmdet3d/core/bbox/structures/box_3d_mode.py @@ -1,197 +1,197 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from enum import IntEnum, unique - -import numpy as np -import torch - -from .base_box3d import BaseInstance3DBoxes -from .cam_box3d import CameraInstance3DBoxes -from .depth_box3d import DepthInstance3DBoxes -from .lidar_box3d import LiDARInstance3DBoxes -from .utils import limit_period - - -@unique -class Box3DMode(IntEnum): - r"""Enum of different ways to represent a box. - - Coordinates in LiDAR: - - .. code-block:: none - - up z - ^ x front - | / - | / - left y <------ 0 - - The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), - and the yaw is around the z axis, thus the rotation axis=2. - - Coordinates in camera: - - .. code-block:: none - - z front - / - / - 0 ------> x right - | - | - v - down y - - The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], - and the yaw is around the y axis, thus the rotation axis=1. - - Coordinates in Depth mode: - - .. code-block:: none - - up z - ^ y front - | / - | / - 0 ------> x right - - The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), - and the yaw is around the z axis, thus the rotation axis=2. - """ - - LIDAR = 0 - CAM = 1 - DEPTH = 2 - - @staticmethod - def convert(box, src, dst, rt_mat=None, with_yaw=True): - """Convert boxes from `src` mode to `dst` mode. 
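The points-in-boxes helpers wrap CUDA ops from mmcv.ops, so the sketch below assumes a GPU build of mmcv and tensors moved to a CUDA device; the class names and expected results are illustrative only:

import torch
from mmdet3d.core.bbox import LiDARInstance3DBoxes

boxes = LiDARInstance3DBoxes(
    torch.tensor([[0.0, 0.0, -1.0, 4.0, 2.0, 1.5, 0.0]])).to('cuda')
points = torch.tensor([[0.5, 0.2, -0.5],     # inside the single box
                       [10.0, 10.0, 0.0]],   # outside every box
                      device='cuda')

part = boxes.points_in_boxes_part(points)  # (M,): index of first enclosing box, -1 if none
allm = boxes.points_in_boxes_all(points)   # (M, T): 0/1 membership for each of the T boxes
print(part)   # expected: [0, -1]
print(allm)   # expected: [[1], [0]]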
- - Args: - box (tuple | list | np.ndarray | - torch.Tensor | :obj:`BaseInstance3DBoxes`): - Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. - src (:obj:`Box3DMode`): The src Box mode. - dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - with_yaw (bool, optional): If `box` is an instance of - :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. - Defaults to True. - - Returns: - (tuple | list | np.ndarray | torch.Tensor | - :obj:`BaseInstance3DBoxes`): - The converted box of the same type. - """ - if src == dst: - return box - - is_numpy = isinstance(box, np.ndarray) - is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) - single_box = isinstance(box, (list, tuple)) - if single_box: - assert len(box) >= 7, ( - 'Box3DMode.convert takes either a k-tuple/list or ' - 'an Nxk array/tensor, where k >= 7') - arr = torch.tensor(box)[None, :] - else: - # avoid modifying the input box - if is_numpy: - arr = torch.from_numpy(np.asarray(box)).clone() - elif is_Instance3DBoxes: - arr = box.tensor.clone() - else: - arr = box.clone() - - if is_Instance3DBoxes: - with_yaw = box.with_yaw - - # convert box from `src` mode to `dst` mode. - x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] - if with_yaw: - yaw = arr[..., 6:7] - if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) - xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) - if with_yaw: - yaw = -yaw - np.pi / 2 - yaw = limit_period(yaw, period=np.pi * 2) - elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) - xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) - if with_yaw: - yaw = -yaw - np.pi / 2 - yaw = limit_period(yaw, period=np.pi * 2) - elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: - if rt_mat is None: - rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) - xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) - if with_yaw: - yaw = -yaw - elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: - if rt_mat is None: - rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) - xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) - if with_yaw: - yaw = -yaw - elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) - xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) - if with_yaw: - yaw = yaw + np.pi / 2 - yaw = limit_period(yaw, period=np.pi * 2) - elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) - xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) - if with_yaw: - yaw = yaw - np.pi / 2 - yaw = limit_period(yaw, period=np.pi * 2) - else: - raise NotImplementedError( - f'Conversion from Box3DMode {src} to {dst} ' - 'is not supported yet') - - if not isinstance(rt_mat, torch.Tensor): - rt_mat = arr.new_tensor(rt_mat) - if rt_mat.size(1) == 4: - extended_xyz = torch.cat( - [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) - xyz = extended_xyz @ rt_mat.t() - else: - xyz = arr[..., :3] @ rt_mat.t() - - if with_yaw: - remains 
= arr[..., 7:] - arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1) - else: - remains = arr[..., 6:] - arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1) - - # convert arr to the original type - original_type = type(box) - if single_box: - return original_type(arr.flatten().tolist()) - if is_numpy: - return arr.numpy() - elif is_Instance3DBoxes: - if dst == Box3DMode.CAM: - target_type = CameraInstance3DBoxes - elif dst == Box3DMode.LIDAR: - target_type = LiDARInstance3DBoxes - elif dst == Box3DMode.DEPTH: - target_type = DepthInstance3DBoxes - else: - raise NotImplementedError( - f'Conversion to {dst} through {original_type}' - ' is not supported yet') - return target_type(arr, box_dim=arr.size(-1), with_yaw=with_yaw) - else: - return arr +# Copyright (c) OpenMMLab. All rights reserved. +from enum import IntEnum, unique + +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes +from .utils import limit_period + + +@unique +class Box3DMode(IntEnum): + r"""Enum of different ways to represent a box. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth mode: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(box, src, dst, rt_mat=None, with_yaw=True): + """Convert boxes from `src` mode to `dst` mode. + + Args: + box (tuple | list | np.ndarray | + torch.Tensor | :obj:`BaseInstance3DBoxes`): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`Box3DMode`): The src Box mode. + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + with_yaw (bool, optional): If `box` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | + :obj:`BaseInstance3DBoxes`): + The converted box of the same type. 
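Box3DMode.convert also accepts a plain N x 7 tensor, in which case it returns a tensor of the same shape. A minimal sketch, assuming Box3DMode is importable from mmdet3d.core.bbox:

import torch
from mmdet3d.core.bbox import Box3DMode

# One LiDAR box: (x, y, z, x_size, y_size, z_size, yaw).
lidar = torch.tensor([[10.0, 2.0, -1.0, 4.0, 2.0, 1.5, 0.3]])

# With rt_mat=None the default LiDAR-to-camera matrix above is used:
# xyz is rotated, sizes are reordered to (x_size, z_size, y_size),
# and yaw becomes -yaw - pi/2 wrapped into one period.
cam = Box3DMode.convert(lidar, Box3DMode.LIDAR, Box3DMode.CAM)
print(cam.shape)   # torch.Size([1, 7])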
+ """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + 'Box3DMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 7') + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + if is_Instance3DBoxes: + with_yaw = box.with_yaw + + # convert box from `src` mode to `dst` mode. + x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if with_yaw: + yaw = arr[..., 6:7] + if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + yaw = -yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + yaw = -yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + yaw = -yaw + elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + if with_yaw: + yaw = -yaw + elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) + if with_yaw: + yaw = yaw + np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) + if with_yaw: + yaw = yaw - np.pi / 2 + yaw = limit_period(yaw, period=np.pi * 2) + else: + raise NotImplementedError( + f'Conversion from Box3DMode {src} to {dst} ' + 'is not supported yet') + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[..., :3] @ rt_mat.t() + + if with_yaw: + remains = arr[..., 7:] + arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1) + else: + remains = arr[..., 6:] + arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Box3DMode.CAM: + target_type = CameraInstance3DBoxes + elif dst == Box3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Box3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type}' + ' is not supported yet') + return target_type(arr, box_dim=arr.size(-1), with_yaw=with_yaw) + else: + return arr diff --git a/mmdet3d/core/bbox/structures/cam_box3d.py 
b/mmdet3d/core/bbox/structures/cam_box3d.py index b708613..edaba2c 100644 --- a/mmdet3d/core/bbox/structures/cam_box3d.py +++ b/mmdet3d/core/bbox/structures/cam_box3d.py @@ -1,354 +1,354 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from ...points import BasePoints -from .base_box3d import BaseInstance3DBoxes -from .utils import rotation_3d_in_axis, yaw2local - - -class CameraInstance3DBoxes(BaseInstance3DBoxes): - """3D boxes of instances in CAM coordinates. - - Coordinates in camera: - - .. code-block:: none - - z front (yaw=-0.5*pi) - / - / - 0 ------> x right (yaw=0) - | - | - v - down y - - The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), - and the yaw is around the y axis, thus the rotation axis=1. - The yaw is 0 at the positive direction of x axis, and decreases from - the positive direction of x to the positive direction of z. - - Attributes: - tensor (torch.Tensor): Float matrix in shape (N, box_dim). - box_dim (int): Integer indicating the dimension of a box - Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). - with_yaw (bool): If True, the value of yaw will be set to 0 as - axis-aligned boxes tightly enclosing the original boxes. - """ - YAW_AXIS = 1 - - def __init__(self, - tensor, - box_dim=7, - with_yaw=True, - origin=(0.5, 1.0, 0.5)): - if isinstance(tensor, torch.Tensor): - device = tensor.device - else: - device = torch.device('cpu') - tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) - if tensor.numel() == 0: - # Use reshape, so we don't end up creating a new tensor that - # does not depend on the inputs (and consequently confuses jit) - tensor = tensor.reshape((0, box_dim)).to( - dtype=torch.float32, device=device) - assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() - - if tensor.shape[-1] == 6: - # If the dimension of boxes is 6, we expand box_dim by padding - # 0 as a fake yaw and set with_yaw to False. - assert box_dim == 6 - fake_rot = tensor.new_zeros(tensor.shape[0], 1) - tensor = torch.cat((tensor, fake_rot), dim=-1) - self.box_dim = box_dim + 1 - self.with_yaw = False - else: - self.box_dim = box_dim - self.with_yaw = with_yaw - self.tensor = tensor.clone() - - if origin != (0.5, 1.0, 0.5): - dst = self.tensor.new_tensor((0.5, 1.0, 0.5)) - src = self.tensor.new_tensor(origin) - self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) - - @property - def height(self): - """torch.Tensor: A vector with height of each box in shape (N, ).""" - return self.tensor[:, 4] - - @property - def top_height(self): - """torch.Tensor: - A vector with the top height of each box in shape (N, ).""" - # the positive direction is down rather than up - return self.bottom_height - self.height - - @property - def bottom_height(self): - """torch.Tensor: - A vector with bottom's height of each box in shape (N, ).""" - return self.tensor[:, 1] - - @property - def local_yaw(self): - """torch.Tensor: - A vector with local yaw of each box in shape (N, ). - local_yaw equals to alpha in kitti, which is commonly - used in monocular 3D object detection task, so only - :obj:`CameraInstance3DBoxes` has the property. 
- """ - yaw = self.yaw - loc = self.gravity_center - local_yaw = yaw2local(yaw, loc) - - return local_yaw - - @property - def gravity_center(self): - """torch.Tensor: A tensor with center of each box in shape (N, 3).""" - bottom_center = self.bottom_center - gravity_center = torch.zeros_like(bottom_center) - gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] - gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5 - return gravity_center - - @property - def corners(self): - """torch.Tensor: Coordinates of corners of all the boxes in - shape (N, 8, 3). - - Convert the boxes to in clockwise order, in the form of - (x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0) - - .. code-block:: none - - front z - / - / - (x0, y0, z1) + ----------- + (x1, y0, z1) - /| / | - / | / | - (x0, y0, z0) + ----------- + + (x1, y1, z1) - | / . | / - | / origin | / - (x0, y1, z0) + ----------- + -------> x right - | (x1, y1, z0) - | - v - down y - """ - if self.tensor.numel() == 0: - return torch.empty([0, 8, 3], device=self.tensor.device) - - dims = self.dims - corners_norm = torch.from_numpy( - np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( - device=dims.device, dtype=dims.dtype) - - corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] - # use relative origin [0.5, 1, 0.5] - corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) - corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) - - corners = rotation_3d_in_axis( - corners, self.tensor[:, 6], axis=self.YAW_AXIS) - corners += self.tensor[:, :3].view(-1, 1, 3) - return corners - - @property - def bev(self): - """torch.Tensor: 2D BEV box of each box with rotation - in XYWHR format, in shape (N, 5).""" - bev = self.tensor[:, [0, 2, 3, 5, 6]].clone() - # positive direction of the gravity axis - # in cam coord system points to the earth - # so the bev yaw angle needs to be reversed - bev[:, -1] = -bev[:, -1] - return bev - - def rotate(self, angle, points=None): - """Rotate boxes with points (optional) with the given angle or rotation - matrix. - - Args: - angle (float | torch.Tensor | np.ndarray): - Rotation angle or rotation matrix. - points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): - Points to rotate. Defaults to None. - - Returns: - tuple or None: When ``points`` is None, the function returns - None, otherwise it returns the rotated points and the - rotation matrix ``rot_mat_T``. - """ - if not isinstance(angle, torch.Tensor): - angle = self.tensor.new_tensor(angle) - - assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ - f'invalid rotation angle shape {angle.shape}' - - if angle.numel() == 1: - self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( - self.tensor[:, 0:3], - angle, - axis=self.YAW_AXIS, - return_mat=True) - else: - rot_mat_T = angle - rot_sin = rot_mat_T[2, 0] - rot_cos = rot_mat_T[0, 0] - angle = np.arctan2(rot_sin, rot_cos) - self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T - - self.tensor[:, 6] += angle - - if points is not None: - if isinstance(points, torch.Tensor): - points[:, :3] = points[:, :3] @ rot_mat_T - elif isinstance(points, np.ndarray): - rot_mat_T = rot_mat_T.cpu().numpy() - points[:, :3] = np.dot(points[:, :3], rot_mat_T) - elif isinstance(points, BasePoints): - points.rotate(rot_mat_T) - else: - raise ValueError - return points, rot_mat_T - - def flip(self, bev_direction='horizontal', points=None): - """Flip the boxes in BEV along given BEV direction. - - In CAM coordinates, it flips the x (horizontal) or z (vertical) axis. 
- - Args: - bev_direction (str): Flip direction (horizontal or vertical). - points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): - Points to flip. Defaults to None. - - Returns: - torch.Tensor, numpy.ndarray or None: Flipped points. - """ - assert bev_direction in ('horizontal', 'vertical') - if bev_direction == 'horizontal': - self.tensor[:, 0::7] = -self.tensor[:, 0::7] - if self.with_yaw: - self.tensor[:, 6] = -self.tensor[:, 6] + np.pi - elif bev_direction == 'vertical': - self.tensor[:, 2::7] = -self.tensor[:, 2::7] - if self.with_yaw: - self.tensor[:, 6] = -self.tensor[:, 6] - - if points is not None: - assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) - if isinstance(points, (torch.Tensor, np.ndarray)): - if bev_direction == 'horizontal': - points[:, 0] = -points[:, 0] - elif bev_direction == 'vertical': - points[:, 2] = -points[:, 2] - elif isinstance(points, BasePoints): - points.flip(bev_direction) - return points - - @classmethod - def height_overlaps(cls, boxes1, boxes2, mode='iou'): - """Calculate height overlaps of two boxes. - - This function calculates the height overlaps between ``boxes1`` and - ``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type. - - Args: - boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes. - boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes. - mode (str, optional): Mode of iou calculation. Defaults to 'iou'. - - Returns: - torch.Tensor: Calculated iou of boxes' heights. - """ - assert isinstance(boxes1, CameraInstance3DBoxes) - assert isinstance(boxes2, CameraInstance3DBoxes) - - boxes1_top_height = boxes1.top_height.view(-1, 1) - boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) - boxes2_top_height = boxes2.top_height.view(1, -1) - boxes2_bottom_height = boxes2.bottom_height.view(1, -1) - - # positive direction of the gravity axis - # in cam coord system points to the earth - heighest_of_bottom = torch.min(boxes1_bottom_height, - boxes2_bottom_height) - lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) - overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0) - return overlaps_h - - def convert_to(self, dst, rt_mat=None): - """Convert self to ``dst`` mode. - - Args: - dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from ``src`` coordinates to ``dst`` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - :obj:`BaseInstance3DBoxes`: - The converted box of the same type in the ``dst`` mode. - """ - from .box_3d_mode import Box3DMode - return Box3DMode.convert( - box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat) - - def points_in_boxes_part(self, points, boxes_override=None): - """Find the box in which each point is. - - Args: - points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), - 3 dimensions are (x, y, z) in LiDAR or depth coordinate. - boxes_override (torch.Tensor, optional): Boxes to override - `self.tensor `. Defaults to None. - - Returns: - torch.Tensor: The index of the box in which - each point is, in shape (M, ). Default value is -1 - (if the point is not enclosed by any box). 
- """ - from .coord_3d_mode import Coord3DMode - - points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, - Coord3DMode.LIDAR) - if boxes_override is not None: - boxes_lidar = boxes_override - else: - boxes_lidar = Coord3DMode.convert(self.tensor, Coord3DMode.CAM, - Coord3DMode.LIDAR) - - box_idx = super().points_in_boxes_part(points_lidar, boxes_lidar) - return box_idx - - def points_in_boxes_all(self, points, boxes_override=None): - """Find all boxes in which each point is. - - Args: - points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), - 3 dimensions are (x, y, z) in LiDAR or depth coordinate. - boxes_override (torch.Tensor, optional): Boxes to override - `self.tensor `. Defaults to None. - - Returns: - torch.Tensor: The index of all boxes in which each point is, - in shape (B, M, T). - """ - from .coord_3d_mode import Coord3DMode - - points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, - Coord3DMode.LIDAR) - if boxes_override is not None: - boxes_lidar = boxes_override - else: - boxes_lidar = Coord3DMode.convert(self.tensor, Coord3DMode.CAM, - Coord3DMode.LIDAR) - - box_idx = super().points_in_boxes_all(points_lidar, boxes_lidar) - return box_idx +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from ...points import BasePoints +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis, yaw2local + + +class CameraInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in CAM coordinates. + + Coordinates in camera: + + .. code-block:: none + + z front (yaw=-0.5*pi) + / + / + 0 ------> x right (yaw=0) + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of z. + + Attributes: + tensor (torch.Tensor): Float matrix in shape (N, box_dim). + box_dim (int): Integer indicating the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as + axis-aligned boxes tightly enclosing the original boxes. + """ + YAW_AXIS = 1 + + def __init__(self, + tensor, + box_dim=7, + with_yaw=True, + origin=(0.5, 1.0, 0.5)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to( + dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. 
+ assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 1.0, 0.5): + dst = self.tensor.new_tensor((0.5, 1.0, 0.5)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def height(self): + """torch.Tensor: A vector with height of each box in shape (N, ).""" + return self.tensor[:, 4] + + @property + def top_height(self): + """torch.Tensor: + A vector with the top height of each box in shape (N, ).""" + # the positive direction is down rather than up + return self.bottom_height - self.height + + @property + def bottom_height(self): + """torch.Tensor: + A vector with bottom's height of each box in shape (N, ).""" + return self.tensor[:, 1] + + @property + def local_yaw(self): + """torch.Tensor: + A vector with local yaw of each box in shape (N, ). + local_yaw equals to alpha in kitti, which is commonly + used in monocular 3D object detection task, so only + :obj:`CameraInstance3DBoxes` has the property. + """ + yaw = self.yaw + loc = self.gravity_center + local_yaw = yaw2local(yaw, loc) + + return local_yaw + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] + gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes in + shape (N, 8, 3). + + Convert the boxes to in clockwise order, in the form of + (x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0) + + .. code-block:: none + + front z + / + / + (x0, y0, z1) + ----------- + (x1, y0, z1) + /| / | + / | / | + (x0, y0, z0) + ----------- + + (x1, y1, z1) + | / . | / + | / origin | / + (x0, y1, z0) + ----------- + -------> x right + | (x1, y1, z0) + | + v + down y + """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 1, 0.5] + corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + corners = rotation_3d_in_axis( + corners, self.tensor[:, 6], axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format, in shape (N, 5).""" + bev = self.tensor[:, [0, 2, 3, 5, 6]].clone() + # positive direction of the gravity axis + # in cam coord system points to the earth + # so the bev yaw angle needs to be reversed + bev[:, -1] = -bev[:, -1] + return bev + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. 
+ + Returns: + tuple or None: When ``points`` is None, the function returns + None, otherwise it returns the rotated points and the + rotation matrix ``rot_mat_T``. + """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[2, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + self.tensor[:, 6] += angle + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction='horizontal', points=None): + """Flip the boxes in BEV along given BEV direction. + + In CAM coordinates, it flips the x (horizontal) or z (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 2::7] = -self.tensor[:, 2::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 0] = -points[:, 0] + elif bev_direction == 'vertical': + points[:, 2] = -points[:, 2] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + @classmethod + def height_overlaps(cls, boxes1, boxes2, mode='iou'): + """Calculate height overlaps of two boxes. + + This function calculates the height overlaps between ``boxes1`` and + ``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes' heights. + """ + assert isinstance(boxes1, CameraInstance3DBoxes) + assert isinstance(boxes2, CameraInstance3DBoxes) + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + # positive direction of the gravity axis + # in cam coord system points to the earth + heighest_of_bottom = torch.min(boxes1_bottom_height, + boxes2_bottom_height) + lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0) + return overlaps_h + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. 
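For camera boxes the gravity axis points down, which is why gravity_center subtracts half the height and the BEV yaw is negated. A short sketch, assuming CameraInstance3DBoxes and Box3DMode are importable from mmdet3d.core.bbox:

import torch
from mmdet3d.core.bbox import Box3DMode, CameraInstance3DBoxes

# One camera box: (x, y, z, x_size, y_size, z_size, yaw), y pointing down.
cam_boxes = CameraInstance3DBoxes(
    torch.tensor([[1.0, 1.5, 10.0, 4.0, 1.5, 2.0, 0.3]]))

print(cam_boxes.gravity_center)  # y shifted up by half the height: 1.5 - 0.75 = 0.75
print(cam_boxes.bev)             # (x, z, x_size, z_size, -yaw)

lidar_boxes = cam_boxes.convert_to(Box3DMode.LIDAR)  # default rt_mat when None
print(type(lidar_boxes).__name__)                    # LiDARInstance3DBoxes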
+ + Args: + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BaseInstance3DBoxes`: + The converted box of the same type in the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert( + box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat) + + def points_in_boxes_part(self, points, boxes_override=None): + """Find the box in which each point is. + + Args: + points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), + 3 dimensions are (x, y, z) in LiDAR or depth coordinate. + boxes_override (torch.Tensor, optional): Boxes to override + `self.tensor `. Defaults to None. + + Returns: + torch.Tensor: The index of the box in which + each point is, in shape (M, ). Default value is -1 + (if the point is not enclosed by any box). + """ + from .coord_3d_mode import Coord3DMode + + points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, + Coord3DMode.LIDAR) + if boxes_override is not None: + boxes_lidar = boxes_override + else: + boxes_lidar = Coord3DMode.convert(self.tensor, Coord3DMode.CAM, + Coord3DMode.LIDAR) + + box_idx = super().points_in_boxes_part(points_lidar, boxes_lidar) + return box_idx + + def points_in_boxes_all(self, points, boxes_override=None): + """Find all boxes in which each point is. + + Args: + points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), + 3 dimensions are (x, y, z) in LiDAR or depth coordinate. + boxes_override (torch.Tensor, optional): Boxes to override + `self.tensor `. Defaults to None. + + Returns: + torch.Tensor: The index of all boxes in which each point is, + in shape (B, M, T). + """ + from .coord_3d_mode import Coord3DMode + + points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, + Coord3DMode.LIDAR) + if boxes_override is not None: + boxes_lidar = boxes_override + else: + boxes_lidar = Coord3DMode.convert(self.tensor, Coord3DMode.CAM, + Coord3DMode.LIDAR) + + box_idx = super().points_in_boxes_all(points_lidar, boxes_lidar) + return box_idx diff --git a/mmdet3d/core/bbox/structures/coord_3d_mode.py b/mmdet3d/core/bbox/structures/coord_3d_mode.py index 6309b65..5748881 100644 --- a/mmdet3d/core/bbox/structures/coord_3d_mode.py +++ b/mmdet3d/core/bbox/structures/coord_3d_mode.py @@ -1,234 +1,234 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from enum import IntEnum, unique - -import numpy as np -import torch - -from ...points import BasePoints, CameraPoints, DepthPoints, LiDARPoints -from .base_box3d import BaseInstance3DBoxes -from .box_3d_mode import Box3DMode - - -@unique -class Coord3DMode(IntEnum): - r"""Enum of different ways to represent a box - and point cloud. - - Coordinates in LiDAR: - - .. code-block:: none - - up z - ^ x front - | / - | / - left y <------ 0 - - The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), - and the yaw is around the z axis, thus the rotation axis=2. - - Coordinates in camera: - - .. code-block:: none - - z front - / - / - 0 ------> x right - | - | - v - down y - - The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], - and the yaw is around the y axis, thus the rotation axis=1. - - Coordinates in Depth mode: - - .. 
code-block:: none - - up z - ^ y front - | / - | / - 0 ------> x right - - The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), - and the yaw is around the z axis, thus the rotation axis=2. - """ - - LIDAR = 0 - CAM = 1 - DEPTH = 2 - - @staticmethod - def convert(input, src, dst, rt_mat=None, with_yaw=True, is_point=True): - """Convert boxes or points from `src` mode to `dst` mode. - - Args: - input (tuple | list | np.ndarray | torch.Tensor | - :obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): - Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. - src (:obj:`Box3DMode` | :obj:`Coord3DMode`): The source mode. - dst (:obj:`Box3DMode` | :obj:`Coord3DMode`): The target mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - with_yaw (bool): If `box` is an instance of - :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. - Defaults to True. - is_point (bool): If `input` is neither an instance of - :obj:`BaseInstance3DBoxes` nor an instance of - :obj:`BasePoints`, whether or not it is point data. - Defaults to True. - - Returns: - (tuple | list | np.ndarray | torch.Tensor | - :obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): - The converted box of the same type. - """ - if isinstance(input, BaseInstance3DBoxes): - return Coord3DMode.convert_box( - input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw) - elif isinstance(input, BasePoints): - return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) - elif isinstance(input, (tuple, list, np.ndarray, torch.Tensor)): - if is_point: - return Coord3DMode.convert_point( - input, src, dst, rt_mat=rt_mat) - else: - return Coord3DMode.convert_box( - input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw) - else: - raise NotImplementedError - - @staticmethod - def convert_box(box, src, dst, rt_mat=None, with_yaw=True): - """Convert boxes from `src` mode to `dst` mode. - - Args: - box (tuple | list | np.ndarray | - torch.Tensor | :obj:`BaseInstance3DBoxes`): - Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. - src (:obj:`Box3DMode`): The src Box mode. - dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - with_yaw (bool): If `box` is an instance of - :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. - Defaults to True. - - Returns: - (tuple | list | np.ndarray | torch.Tensor | - :obj:`BaseInstance3DBoxes`): - The converted box of the same type. - """ - return Box3DMode.convert(box, src, dst, rt_mat=rt_mat) - - @staticmethod - def convert_point(point, src, dst, rt_mat=None): - """Convert points from `src` mode to `dst` mode. - - Args: - point (tuple | list | np.ndarray | - torch.Tensor | :obj:`BasePoints`): - Can be a k-tuple, k-list or an Nxk array/tensor. - src (:obj:`CoordMode`): The src Point mode. - dst (:obj:`CoordMode`): The target Point mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. 
- The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - (tuple | list | np.ndarray | torch.Tensor | :obj:`BasePoints`): - The converted point of the same type. - """ - if src == dst: - return point - - is_numpy = isinstance(point, np.ndarray) - is_InstancePoints = isinstance(point, BasePoints) - single_point = isinstance(point, (list, tuple)) - if single_point: - assert len(point) >= 3, ( - 'CoordMode.convert takes either a k-tuple/list or ' - 'an Nxk array/tensor, where k >= 3') - arr = torch.tensor(point)[None, :] - else: - # avoid modifying the input point - if is_numpy: - arr = torch.from_numpy(np.asarray(point)).clone() - elif is_InstancePoints: - arr = point.tensor.clone() - else: - arr = point.clone() - - # convert point from `src` mode to `dst` mode. - if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) - elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) - elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: - if rt_mat is None: - rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) - elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: - if rt_mat is None: - rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) - elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) - elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: - if rt_mat is None: - rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) - else: - raise NotImplementedError( - f'Conversion from Coord3DMode {src} to {dst} ' - 'is not supported yet') - - if not isinstance(rt_mat, torch.Tensor): - rt_mat = arr.new_tensor(rt_mat) - if rt_mat.size(1) == 4: - extended_xyz = torch.cat( - [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) - xyz = extended_xyz @ rt_mat.t() - else: - xyz = arr[..., :3] @ rt_mat.t() - - remains = arr[..., 3:] - arr = torch.cat([xyz[..., :3], remains], dim=-1) - - # convert arr to the original type - original_type = type(point) - if single_point: - return original_type(arr.flatten().tolist()) - if is_numpy: - return arr.numpy() - elif is_InstancePoints: - if dst == Coord3DMode.CAM: - target_type = CameraPoints - elif dst == Coord3DMode.LIDAR: - target_type = LiDARPoints - elif dst == Coord3DMode.DEPTH: - target_type = DepthPoints - else: - raise NotImplementedError( - f'Conversion to {dst} through {original_type}' - ' is not supported yet') - return target_type( - arr, - points_dim=arr.size(-1), - attribute_dims=point.attribute_dims) - else: - return arr +# Copyright (c) OpenMMLab. All rights reserved. +from enum import IntEnum, unique + +import numpy as np +import torch + +from ...points import BasePoints, CameraPoints, DepthPoints, LiDARPoints +from .base_box3d import BaseInstance3DBoxes +from .box_3d_mode import Box3DMode + + +@unique +class Coord3DMode(IntEnum): + r"""Enum of different ways to represent a box + and point cloud. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in camera: + + .. 
code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth mode: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(input, src, dst, rt_mat=None, with_yaw=True, is_point=True): + """Convert boxes or points from `src` mode to `dst` mode. + + Args: + input (tuple | list | np.ndarray | torch.Tensor | + :obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`Box3DMode` | :obj:`Coord3DMode`): The source mode. + dst (:obj:`Box3DMode` | :obj:`Coord3DMode`): The target mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + with_yaw (bool): If `box` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + is_point (bool): If `input` is neither an instance of + :obj:`BaseInstance3DBoxes` nor an instance of + :obj:`BasePoints`, whether or not it is point data. + Defaults to True. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | + :obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): + The converted box of the same type. + """ + if isinstance(input, BaseInstance3DBoxes): + return Coord3DMode.convert_box( + input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw) + elif isinstance(input, BasePoints): + return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) + elif isinstance(input, (tuple, list, np.ndarray, torch.Tensor)): + if is_point: + return Coord3DMode.convert_point( + input, src, dst, rt_mat=rt_mat) + else: + return Coord3DMode.convert_box( + input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw) + else: + raise NotImplementedError + + @staticmethod + def convert_box(box, src, dst, rt_mat=None, with_yaw=True): + """Convert boxes from `src` mode to `dst` mode. + + Args: + box (tuple | list | np.ndarray | + torch.Tensor | :obj:`BaseInstance3DBoxes`): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`Box3DMode`): The src Box mode. + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + with_yaw (bool): If `box` is an instance of + :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. + Defaults to True. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | + :obj:`BaseInstance3DBoxes`): + The converted box of the same type. + """ + return Box3DMode.convert(box, src, dst, rt_mat=rt_mat) + + @staticmethod + def convert_point(point, src, dst, rt_mat=None): + """Convert points from `src` mode to `dst` mode. + + Args: + point (tuple | list | np.ndarray | + torch.Tensor | :obj:`BasePoints`): + Can be a k-tuple, k-list or an Nxk array/tensor. 
+ src (:obj:`CoordMode`): The src Point mode. + dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | :obj:`BasePoints`): + The converted point of the same type. + """ + if src == dst: + return point + + is_numpy = isinstance(point, np.ndarray) + is_InstancePoints = isinstance(point, BasePoints) + single_point = isinstance(point, (list, tuple)) + if single_point: + assert len(point) >= 3, ( + 'CoordMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 3') + arr = torch.tensor(point)[None, :] + else: + # avoid modifying the input point + if is_numpy: + arr = torch.from_numpy(np.asarray(point)).clone() + elif is_InstancePoints: + arr = point.tensor.clone() + else: + arr = point.clone() + + # convert point from `src` mode to `dst` mode. + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + else: + raise NotImplementedError( + f'Conversion from Coord3DMode {src} to {dst} ' + 'is not supported yet') + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[..., :3] @ rt_mat.t() + + remains = arr[..., 3:] + arr = torch.cat([xyz[..., :3], remains], dim=-1) + + # convert arr to the original type + original_type = type(point) + if single_point: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_InstancePoints: + if dst == Coord3DMode.CAM: + target_type = CameraPoints + elif dst == Coord3DMode.LIDAR: + target_type = LiDARPoints + elif dst == Coord3DMode.DEPTH: + target_type = DepthPoints + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type}' + ' is not supported yet') + return target_type( + arr, + points_dim=arr.size(-1), + attribute_dims=point.attribute_dims) + else: + return arr diff --git a/mmdet3d/core/bbox/structures/depth_box3d.py b/mmdet3d/core/bbox/structures/depth_box3d.py index dd9278b..75b970e 100644 --- a/mmdet3d/core/bbox/structures/depth_box3d.py +++ b/mmdet3d/core/bbox/structures/depth_box3d.py @@ -1,270 +1,270 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
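To make the default rotation matrices in `Coord3DMode.convert_point` above concrete, here is a small round-trip sketch (module path taken from this diff, point values made up):

```python
import torch

from mmdet3d.core.bbox.structures.coord_3d_mode import Coord3DMode  # path from this diff

# One hypothetical LiDAR point (x, y, z) plus an extra feature (e.g. intensity).
pts_lidar = torch.tensor([[10.0, 2.0, -1.0, 0.7]])

# With rt_mat=None the built-in LIDAR -> CAM rotation is used:
# x_cam = -y_lidar, y_cam = -z_lidar, z_cam = x_lidar; extra dims pass through.
pts_cam = Coord3DMode.convert_point(pts_lidar, Coord3DMode.LIDAR, Coord3DMode.CAM)
print(pts_cam)   # tensor([[-2.0000,  1.0000, 10.0000,  0.7000]])

# The CAM -> LIDAR default is the inverse rotation, so the round trip is lossless.
pts_back = Coord3DMode.convert_point(pts_cam, Coord3DMode.CAM, Coord3DMode.LIDAR)
print(torch.allclose(pts_back, pts_lidar))   # True
```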
-import numpy as np -import torch - -from mmdet3d.core.points import BasePoints -from .base_box3d import BaseInstance3DBoxes -from .utils import rotation_3d_in_axis - - -class DepthInstance3DBoxes(BaseInstance3DBoxes): - """3D boxes of instances in Depth coordinates. - - Coordinates in Depth: - - .. code-block:: none - - up z y front (yaw=-0.5*pi) - ^ ^ - | / - | / - 0 ------> x right (yaw=0) - - The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), - and the yaw is around the z axis, thus the rotation axis=2. - The yaw is 0 at the positive direction of x axis, and decreases from - the positive direction of x to the positive direction of y. - Also note that rotation of DepthInstance3DBoxes is counterclockwise, - which is reverse to the definition of the yaw angle (clockwise). - - A refactor is ongoing to make the three coordinate systems - easier to understand and convert between each other. - - Attributes: - tensor (torch.Tensor): Float matrix of N x box_dim. - box_dim (int): Integer indicates the dimension of a box - Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). - with_yaw (bool): If True, the value of yaw will be set to 0 as minmax - boxes. - """ - YAW_AXIS = 2 - - @property - def gravity_center(self): - """torch.Tensor: A tensor with center of each box in shape (N, 3).""" - bottom_center = self.bottom_center - gravity_center = torch.zeros_like(bottom_center) - gravity_center[:, :2] = bottom_center[:, :2] - gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 - return gravity_center - - @property - def corners(self): - """torch.Tensor: Coordinates of corners of all the boxes - in shape (N, 8, 3). - - Convert the boxes to corners in clockwise order, in form of - ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` - - .. code-block:: none - - up z - front y ^ - / | - / | - (x0, y1, z1) + ----------- + (x1, y1, z1) - /| / | - / | / | - (x0, y0, z1) + ----------- + + (x1, y1, z0) - | / . | / - | / origin | / - (x0, y0, z0) + ----------- + --------> right x - (x1, y0, z0) - """ - if self.tensor.numel() == 0: - return torch.empty([0, 8, 3], device=self.tensor.device) - - dims = self.dims - corners_norm = torch.from_numpy( - np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( - device=dims.device, dtype=dims.dtype) - - corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] - # use relative origin (0.5, 0.5, 0) - corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) - corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) - - # rotate around z axis - corners = rotation_3d_in_axis( - corners, self.tensor[:, 6], axis=self.YAW_AXIS) - corners += self.tensor[:, :3].view(-1, 1, 3) - return corners - - def rotate(self, angle, points=None): - """Rotate boxes with points (optional) with the given angle or rotation - matrix. - - Args: - angle (float | torch.Tensor | np.ndarray): - Rotation angle or rotation matrix. - points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): - Points to rotate. Defaults to None. - - Returns: - tuple or None: When ``points`` is None, the function returns - None, otherwise it returns the rotated points and the - rotation matrix ``rot_mat_T``. 
- """ - if not isinstance(angle, torch.Tensor): - angle = self.tensor.new_tensor(angle) - - assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ - f'invalid rotation angle shape {angle.shape}' - - if angle.numel() == 1: - self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( - self.tensor[:, 0:3], - angle, - axis=self.YAW_AXIS, - return_mat=True) - else: - rot_mat_T = angle - rot_sin = rot_mat_T[0, 1] - rot_cos = rot_mat_T[0, 0] - angle = np.arctan2(rot_sin, rot_cos) - self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T - - if self.with_yaw: - self.tensor[:, 6] += angle - else: - # for axis-aligned boxes, we take the new - # enclosing axis-aligned boxes after rotation - corners_rot = self.corners @ rot_mat_T - new_x_size = corners_rot[..., 0].max( - dim=1, keepdim=True)[0] - corners_rot[..., 0].min( - dim=1, keepdim=True)[0] - new_y_size = corners_rot[..., 1].max( - dim=1, keepdim=True)[0] - corners_rot[..., 1].min( - dim=1, keepdim=True)[0] - self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1) - - if points is not None: - if isinstance(points, torch.Tensor): - points[:, :3] = points[:, :3] @ rot_mat_T - elif isinstance(points, np.ndarray): - rot_mat_T = rot_mat_T.cpu().numpy() - points[:, :3] = np.dot(points[:, :3], rot_mat_T) - elif isinstance(points, BasePoints): - points.rotate(rot_mat_T) - else: - raise ValueError - return points, rot_mat_T - - def flip(self, bev_direction='horizontal', points=None): - """Flip the boxes in BEV along given BEV direction. - - In Depth coordinates, it flips x (horizontal) or y (vertical) axis. - - Args: - bev_direction (str, optional): Flip direction - (horizontal or vertical). Defaults to 'horizontal'. - points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): - Points to flip. Defaults to None. - - Returns: - torch.Tensor, numpy.ndarray or None: Flipped points. - """ - assert bev_direction in ('horizontal', 'vertical') - if bev_direction == 'horizontal': - self.tensor[:, 0::7] = -self.tensor[:, 0::7] - if self.with_yaw: - self.tensor[:, 6] = -self.tensor[:, 6] + np.pi - elif bev_direction == 'vertical': - self.tensor[:, 1::7] = -self.tensor[:, 1::7] - if self.with_yaw: - self.tensor[:, 6] = -self.tensor[:, 6] - - if points is not None: - assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) - if isinstance(points, (torch.Tensor, np.ndarray)): - if bev_direction == 'horizontal': - points[:, 0] = -points[:, 0] - elif bev_direction == 'vertical': - points[:, 1] = -points[:, 1] - elif isinstance(points, BasePoints): - points.flip(bev_direction) - return points - - def convert_to(self, dst, rt_mat=None): - """Convert self to ``dst`` mode. - - Args: - dst (:obj:`Box3DMode`): The target Box mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from ``src`` coordinates to ``dst`` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - :obj:`DepthInstance3DBoxes`: - The converted box of the same type in the ``dst`` mode. - """ - from .box_3d_mode import Box3DMode - return Box3DMode.convert( - box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat) - - def enlarged_box(self, extra_width): - """Enlarge the length, width and height boxes. - - Args: - extra_width (float | torch.Tensor): Extra width to enlarge the box. - - Returns: - :obj:`DepthInstance3DBoxes`: Enlarged boxes. 
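The axis-aligned branch of `rotate` above (used when `with_yaw` is False) replaces the rotated footprint with its axis-aligned enclosing extents. A self-contained sketch of that update with made-up sizes:

```python
import math

import torch

# BEV footprint of an axis-aligned box centred at the origin.
x_size, y_size, angle = 2.0, 4.0, 0.3
corners = torch.tensor([[-0.5, -0.5], [-0.5, 0.5], [0.5, -0.5], [0.5, 0.5]])
corners = corners * torch.tensor([x_size, y_size])

# Transposed 2D rotation matrix, matching the `corners @ rot_mat_T` convention above.
rot_cos, rot_sin = math.cos(angle), math.sin(angle)
rot_mat_T = torch.tensor([[rot_cos, rot_sin], [-rot_sin, rot_cos]])
corners_rot = corners @ rot_mat_T

# New enclosing axis-aligned sizes; both are >= the original sizes.
new_x_size = corners_rot[:, 0].max() - corners_rot[:, 0].min()
new_y_size = corners_rot[:, 1].max() - corners_rot[:, 1].min()
print(new_x_size.item(), new_y_size.item())   # ~3.09 and ~4.41
```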
- """ - enlarged_boxes = self.tensor.clone() - enlarged_boxes[:, 3:6] += extra_width * 2 - # bottom center z minus extra_width - enlarged_boxes[:, 2] -= extra_width - return self.new_box(enlarged_boxes) - - def get_surface_line_center(self): - """Compute surface and line center of bounding boxes. - - Returns: - torch.Tensor: Surface and line center of bounding boxes. - """ - obj_size = self.dims - center = self.gravity_center.view(-1, 1, 3) - batch_size = center.shape[0] - - rot_sin = torch.sin(-self.yaw) - rot_cos = torch.cos(-self.yaw) - rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3])) - rot_mat_T[..., 0, 0] = rot_cos - rot_mat_T[..., 0, 1] = -rot_sin - rot_mat_T[..., 1, 0] = rot_sin - rot_mat_T[..., 1, 1] = rot_cos - rot_mat_T[..., 2, 2] = 1 - - # Get the object surface center - offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0], - [0, -1, 0], [1, 0, 0], [-1, 0, 0]]) - offset = offset.view(1, 6, 3) / 2 - surface_3d = (offset * - obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape( - -1, 3) - - # Get the object line center - offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1], - [0, -1, 1], [1, 0, -1], [-1, 0, -1], - [0, 1, -1], [0, -1, -1], [1, 1, 0], - [1, -1, 0], [-1, 1, 0], [-1, -1, 0]]) - offset = offset.view(1, 12, 3) / 2 - - line_3d = (offset * - obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape( - -1, 3) - - surface_rot = rot_mat_T.repeat(6, 1, 1) - surface_3d = torch.matmul(surface_3d.unsqueeze(-2), - surface_rot).squeeze(-2) - surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d - - line_rot = rot_mat_T.repeat(12, 1, 1) - line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2) - line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d - - return surface_center, line_center +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet3d.core.points import BasePoints +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class DepthInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in Depth coordinates. + + Coordinates in Depth: + + .. code-block:: none + + up z y front (yaw=-0.5*pi) + ^ ^ + | / + | / + 0 ------> x right (yaw=0) + + The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of y. + Also note that rotation of DepthInstance3DBoxes is counterclockwise, + which is reverse to the definition of the yaw angle (clockwise). + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + YAW_AXIS = 2 + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). 
+ + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 0.5, 0) + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis( + corners, self.tensor[:, 6], axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns + None, otherwise it returns the rotated points and the + rotation matrix ``rot_mat_T``. + """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + if self.with_yaw: + self.tensor[:, 6] += angle + else: + # for axis-aligned boxes, we take the new + # enclosing axis-aligned boxes after rotation + corners_rot = self.corners @ rot_mat_T + new_x_size = corners_rot[..., 0].max( + dim=1, keepdim=True)[0] - corners_rot[..., 0].min( + dim=1, keepdim=True)[0] + new_y_size = corners_rot[..., 1].max( + dim=1, keepdim=True)[0] - corners_rot[..., 1].min( + dim=1, keepdim=True)[0] + self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1) + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction='horizontal', points=None): + """Flip the boxes in BEV along given BEV direction. + + In Depth coordinates, it flips x (horizontal) or y (vertical) axis. + + Args: + bev_direction (str, optional): Flip direction + (horizontal or vertical). Defaults to 'horizontal'. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. 
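The `corners` property above encodes the Depth convention that the stored centre is the bottom centre (relative origin (0.5, 0.5, 0)). A quick check with a unit cube, assuming the usual `mmdet3d.core.bbox` export:

```python
import torch

from mmdet3d.core.bbox import DepthInstance3DBoxes  # assumed export path

# A unit cube with zero yaw whose bottom centre sits at the origin.
boxes = DepthInstance3DBoxes(torch.tensor([[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]]))
corners = boxes.corners   # shape (1, 8, 3)

print(corners[0, :, 2].min().item(), corners[0, :, 2].max().item())  # 0.0 1.0 (bottom origin)
print(corners[0, :, 0].min().item(), corners[0, :, 0].max().item())  # -0.5 0.5 (centred in x and y)
```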
+ """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 0] = -points[:, 0] + elif bev_direction == 'vertical': + points[:, 1] = -points[:, 1] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`DepthInstance3DBoxes`: + The converted box of the same type in the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert( + box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat) + + def enlarged_box(self, extra_width): + """Enlarge the length, width and height boxes. + + Args: + extra_width (float | torch.Tensor): Extra width to enlarge the box. + + Returns: + :obj:`DepthInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def get_surface_line_center(self): + """Compute surface and line center of bounding boxes. + + Returns: + torch.Tensor: Surface and line center of bounding boxes. 
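A short usage sketch for the two helpers above, `enlarged_box` and `get_surface_line_center`, again with a made-up box and the assumed `mmdet3d.core.bbox` export:

```python
import torch

from mmdet3d.core.bbox import DepthInstance3DBoxes  # assumed export path

# One hypothetical depth-coordinate box: (x, y, z, x_size, y_size, z_size, yaw).
boxes = DepthInstance3DBoxes(torch.tensor([[1.0, 2.0, 0.5, 0.8, 0.6, 1.2, 0.1]]))

# enlarged_box grows every size by 2 * extra_width and lowers the bottom by
# extra_width, so the original box stays centred inside the enlarged one.
bigger = boxes.enlarged_box(0.1)
print(bigger.dims)            # tensor([[1.0000, 0.8000, 1.4000]])
print(bigger.bottom_center)   # tensor([[1.0000, 2.0000, 0.4000]])

# Six surface centres and twelve edge (line) centres per box.
surface_center, line_center = boxes.get_surface_line_center()
print(surface_center.shape, line_center.shape)   # torch.Size([6, 3]) torch.Size([12, 3])
```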
+ """ + obj_size = self.dims + center = self.gravity_center.view(-1, 1, 3) + batch_size = center.shape[0] + + rot_sin = torch.sin(-self.yaw) + rot_cos = torch.cos(-self.yaw) + rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3])) + rot_mat_T[..., 0, 0] = rot_cos + rot_mat_T[..., 0, 1] = -rot_sin + rot_mat_T[..., 1, 0] = rot_sin + rot_mat_T[..., 1, 1] = rot_cos + rot_mat_T[..., 2, 2] = 1 + + # Get the object surface center + offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0], + [0, -1, 0], [1, 0, 0], [-1, 0, 0]]) + offset = offset.view(1, 6, 3) / 2 + surface_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape( + -1, 3) + + # Get the object line center + offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1], + [0, -1, 1], [1, 0, -1], [-1, 0, -1], + [0, 1, -1], [0, -1, -1], [1, 1, 0], + [1, -1, 0], [-1, 1, 0], [-1, -1, 0]]) + offset = offset.view(1, 12, 3) / 2 + + line_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape( + -1, 3) + + surface_rot = rot_mat_T.repeat(6, 1, 1) + surface_3d = torch.matmul(surface_3d.unsqueeze(-2), + surface_rot).squeeze(-2) + surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d + + line_rot = rot_mat_T.repeat(12, 1, 1) + line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2) + line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d + + return surface_center, line_center diff --git a/mmdet3d/core/bbox/structures/lidar_box3d.py b/mmdet3d/core/bbox/structures/lidar_box3d.py index 706a6c0..0ec9100 100644 --- a/mmdet3d/core/bbox/structures/lidar_box3d.py +++ b/mmdet3d/core/bbox/structures/lidar_box3d.py @@ -1,210 +1,210 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from mmdet3d.core.points import BasePoints -from .base_box3d import BaseInstance3DBoxes -from .utils import rotation_3d_in_axis - - -class LiDARInstance3DBoxes(BaseInstance3DBoxes): - """3D boxes of instances in LIDAR coordinates. - - Coordinates in LiDAR: - - .. code-block:: none - - up z x front (yaw=0) - ^ ^ - | / - | / - (yaw=0.5*pi) left y <------ 0 - - The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), - and the yaw is around the z axis, thus the rotation axis=2. - The yaw is 0 at the positive direction of x axis, and increases from - the positive direction of x to the positive direction of y. - - A refactor is ongoing to make the three coordinate systems - easier to understand and convert between each other. - - Attributes: - tensor (torch.Tensor): Float matrix of N x box_dim. - box_dim (int): Integer indicating the dimension of a box. - Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). - with_yaw (bool): If True, the value of yaw will be set to 0 as minmax - boxes. - """ - YAW_AXIS = 2 - - @property - def gravity_center(self): - """torch.Tensor: A tensor with center of each box in shape (N, 3).""" - bottom_center = self.bottom_center - gravity_center = torch.zeros_like(bottom_center) - gravity_center[:, :2] = bottom_center[:, :2] - gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 - return gravity_center - - @property - def corners(self): - """torch.Tensor: Coordinates of corners of all the boxes - in shape (N, 8, 3). - - Convert the boxes to corners in clockwise order, in form of - ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` - - .. 
code-block:: none - - up z - front x ^ - / | - / | - (x1, y0, z1) + ----------- + (x1, y1, z1) - /| / | - / | / | - (x0, y0, z1) + ----------- + + (x1, y1, z0) - | / . | / - | / origin | / - left y<-------- + ----------- + (x0, y1, z0) - (x0, y0, z0) - """ - if self.tensor.numel() == 0: - return torch.empty([0, 8, 3], device=self.tensor.device) - - dims = self.dims - corners_norm = torch.from_numpy( - np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( - device=dims.device, dtype=dims.dtype) - - corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] - # use relative origin [0.5, 0.5, 0] - corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) - corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) - - # rotate around z axis - corners = rotation_3d_in_axis( - corners, self.tensor[:, 6], axis=self.YAW_AXIS) - corners += self.tensor[:, :3].view(-1, 1, 3) - return corners - - def rotate(self, angle, points=None): - """Rotate boxes with points (optional) with the given angle or rotation - matrix. - - Args: - angles (float | torch.Tensor | np.ndarray): - Rotation angle or rotation matrix. - points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): - Points to rotate. Defaults to None. - - Returns: - tuple or None: When ``points`` is None, the function returns - None, otherwise it returns the rotated points and the - rotation matrix ``rot_mat_T``. - """ - if not isinstance(angle, torch.Tensor): - angle = self.tensor.new_tensor(angle) - - assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ - f'invalid rotation angle shape {angle.shape}' - - if angle.numel() == 1: - self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( - self.tensor[:, 0:3], - angle, - axis=self.YAW_AXIS, - return_mat=True) - else: - rot_mat_T = angle - rot_sin = rot_mat_T[0, 1] - rot_cos = rot_mat_T[0, 0] - angle = np.arctan2(rot_sin, rot_cos) - self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T - - self.tensor[:, 6] += angle - - if self.tensor.shape[1] == 9: - # rotate velo vector - self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] - - if points is not None: - if isinstance(points, torch.Tensor): - points[:, :3] = points[:, :3] @ rot_mat_T - elif isinstance(points, np.ndarray): - rot_mat_T = rot_mat_T.cpu().numpy() - points[:, :3] = np.dot(points[:, :3], rot_mat_T) - elif isinstance(points, BasePoints): - points.rotate(rot_mat_T) - else: - raise ValueError - return points, rot_mat_T - - def flip(self, bev_direction='horizontal', points=None): - """Flip the boxes in BEV along given BEV direction. - - In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. - - Args: - bev_direction (str): Flip direction (horizontal or vertical). - points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): - Points to flip. Defaults to None. - - Returns: - torch.Tensor, numpy.ndarray or None: Flipped points. 
- """ - assert bev_direction in ('horizontal', 'vertical') - if bev_direction == 'horizontal': - self.tensor[:, 1::7] = -self.tensor[:, 1::7] - if self.with_yaw: - self.tensor[:, 6] = -self.tensor[:, 6] - elif bev_direction == 'vertical': - self.tensor[:, 0::7] = -self.tensor[:, 0::7] - if self.with_yaw: - self.tensor[:, 6] = -self.tensor[:, 6] + np.pi - - if points is not None: - assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) - if isinstance(points, (torch.Tensor, np.ndarray)): - if bev_direction == 'horizontal': - points[:, 1] = -points[:, 1] - elif bev_direction == 'vertical': - points[:, 0] = -points[:, 0] - elif isinstance(points, BasePoints): - points.flip(bev_direction) - return points - - def convert_to(self, dst, rt_mat=None): - """Convert self to ``dst`` mode. - - Args: - dst (:obj:`Box3DMode`): the target Box mode - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from ``src`` coordinates to ``dst`` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - :obj:`BaseInstance3DBoxes`: - The converted box of the same type in the ``dst`` mode. - """ - from .box_3d_mode import Box3DMode - return Box3DMode.convert( - box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat) - - def enlarged_box(self, extra_width): - """Enlarge the length, width and height boxes. - - Args: - extra_width (float | torch.Tensor): Extra width to enlarge the box. - - Returns: - :obj:`LiDARInstance3DBoxes`: Enlarged boxes. - """ - enlarged_boxes = self.tensor.clone() - enlarged_boxes[:, 3:6] += extra_width * 2 - # bottom center z minus extra_width - enlarged_boxes[:, 2] -= extra_width - return self.new_box(enlarged_boxes) +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmdet3d.core.points import BasePoints +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class LiDARInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in LIDAR coordinates. + + Coordinates in LiDAR: + + .. code-block:: none + + up z x front (yaw=0) + ^ ^ + | / + | / + (yaw=0.5*pi) left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the positive direction of x axis, and increases from + the positive direction of x to the positive direction of y. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + YAW_AXIS = 2 + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box in shape (N, 3).""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). 
+ + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front x ^ + / | + / | + (x1, y0, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + left y<-------- + ----------- + (x0, y1, z0) + (x0, y0, z0) + """ + if self.tensor.numel() == 0: + return torch.empty([0, 8, 3], device=self.tensor.device) + + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 0.5, 0] + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis( + corners, self.tensor[:, 6], axis=self.YAW_AXIS) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or rotation + matrix. + + Args: + angles (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns + None, otherwise it returns the rotated points and the + rotation matrix ``rot_mat_T``. + """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( + self.tensor[:, 0:3], + angle, + axis=self.YAW_AXIS, + return_mat=True) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + + self.tensor[:, 6] += angle + + if self.tensor.shape[1] == 9: + # rotate velo vector + self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.cpu().numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + points.rotate(rot_mat_T) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction='horizontal', points=None): + """Flip the boxes in BEV along given BEV direction. + + In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. 
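When the box tensor has 9 columns (nuScenes-style boxes with a 2D velocity), `rotate` above also rotates the velocity vector. A small sketch with made-up values and the assumed `mmdet3d.core.bbox` export:

```python
import numpy as np
import torch

from mmdet3d.core.bbox import LiDARInstance3DBoxes  # assumed export path

# (x, y, z, x_size, y_size, z_size, yaw, vx, vy)
boxes = LiDARInstance3DBoxes(
    torch.tensor([[10.0, 0.0, -1.0, 4.0, 1.8, 1.6, 0.0, 5.0, 0.0]]), box_dim=9)
points = torch.tensor([[10.0, 0.0, 0.0, 0.5]])   # (x, y, z, intensity)

# A scalar angle rotates the centres, the yaw, the velocity and the given points,
# and the transposed rotation matrix is returned alongside the points.
rotated_points, rot_mat_T = boxes.rotate(np.pi / 2, points)
print(boxes.tensor[0, 6])     # yaw increased by pi / 2
print(boxes.tensor[0, 7:9])   # velocity rotated to roughly (0, 5)
print(rot_mat_T.shape)        # torch.Size([3, 3])
```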
+ """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + elif bev_direction == 'vertical': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 1] = -points[:, 1] + elif bev_direction == 'vertical': + points[:, 0] = -points[:, 0] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): the target Box mode + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BaseInstance3DBoxes`: + The converted box of the same type in the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert( + box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat) + + def enlarged_box(self, extra_width): + """Enlarge the length, width and height boxes. + + Args: + extra_width (float | torch.Tensor): Extra width to enlarge the box. + + Returns: + :obj:`LiDARInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) diff --git a/mmdet3d/core/bbox/structures/utils.py b/mmdet3d/core/bbox/structures/utils.py index 82a4c25..90b5763 100644 --- a/mmdet3d/core/bbox/structures/utils.py +++ b/mmdet3d/core/bbox/structures/utils.py @@ -1,335 +1,335 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from logging import warning - -import numpy as np -import torch - -from mmdet3d.core.utils import array_converter - - -@array_converter(apply_to=('val', )) -def limit_period(val, offset=0.5, period=np.pi): - """Limit the value into a period for periodic function. - - Args: - val (torch.Tensor | np.ndarray): The value to be converted. - offset (float, optional): Offset to set the value range. - Defaults to 0.5. - period ([type], optional): Period of the value. Defaults to np.pi. - - Returns: - (torch.Tensor | np.ndarray): Value in the range of - [-offset * period, (1-offset) * period] - """ - limited_val = val - torch.floor(val / period + offset) * period - return limited_val - - -@array_converter(apply_to=('points', 'angles')) -def rotation_3d_in_axis(points, - angles, - axis=0, - return_mat=False, - clockwise=False): - """Rotate points by angles according to axis. - - Args: - points (np.ndarray | torch.Tensor | list | tuple ): - Points of shape (N, M, 3). - angles (np.ndarray | torch.Tensor | list | tuple | float): - Vector of angles in shape (N,) - axis (int, optional): The axis to be rotated. Defaults to 0. - return_mat: Whether or not return the rotation matrix (transposed). - Defaults to False. - clockwise: Whether the rotation is clockwise. Defaults to False. - - Raises: - ValueError: when the axis is not in range [0, 1, 2], it will - raise value error. 
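The horizontal BEV flip above negates y and the yaw, for the boxes and for any accompanying points. A matching sketch (made-up values, assumed export path):

```python
import torch

from mmdet3d.core.bbox import LiDARInstance3DBoxes  # assumed export path

boxes = LiDARInstance3DBoxes(
    torch.tensor([[10.0, 2.0, -1.0, 4.0, 1.8, 1.6, 0.3]]))
points = torch.tensor([[10.0, 2.5, 0.0, 0.4]])

flipped_points = boxes.flip('horizontal', points)   # flips in place and returns the points
print(boxes.tensor[0, [1, 6]])   # tensor([-2.0000, -0.3000]): y and yaw negated
print(flipped_points[0, 1])      # tensor(-2.5000)
```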
- - Returns: - (torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3). - """ - batch_free = len(points.shape) == 2 - if batch_free: - points = points[None] - - if isinstance(angles, float) or len(angles.shape) == 0: - angles = torch.full(points.shape[:1], angles) - - assert len(points.shape) == 3 and len(angles.shape) == 1 \ - and points.shape[0] == angles.shape[0], f'Incorrect shape of points ' \ - f'angles: {points.shape}, {angles.shape}' - - assert points.shape[-1] in [2, 3], \ - f'Points size should be 2 or 3 instead of {points.shape[-1]}' - - rot_sin = torch.sin(angles) - rot_cos = torch.cos(angles) - ones = torch.ones_like(rot_cos) - zeros = torch.zeros_like(rot_cos) - - if points.shape[-1] == 3: - if axis == 1 or axis == -2: - rot_mat_T = torch.stack([ - torch.stack([rot_cos, zeros, -rot_sin]), - torch.stack([zeros, ones, zeros]), - torch.stack([rot_sin, zeros, rot_cos]) - ]) - elif axis == 2 or axis == -1: - rot_mat_T = torch.stack([ - torch.stack([rot_cos, rot_sin, zeros]), - torch.stack([-rot_sin, rot_cos, zeros]), - torch.stack([zeros, zeros, ones]) - ]) - elif axis == 0 or axis == -3: - rot_mat_T = torch.stack([ - torch.stack([ones, zeros, zeros]), - torch.stack([zeros, rot_cos, rot_sin]), - torch.stack([zeros, -rot_sin, rot_cos]) - ]) - else: - raise ValueError(f'axis should in range ' - f'[-3, -2, -1, 0, 1, 2], got {axis}') - else: - rot_mat_T = torch.stack([ - torch.stack([rot_cos, rot_sin]), - torch.stack([-rot_sin, rot_cos]) - ]) - - if clockwise: - rot_mat_T = rot_mat_T.transpose(0, 1) - - if points.shape[0] == 0: - points_new = points - else: - points_new = torch.einsum('aij,jka->aik', points, rot_mat_T) - - if batch_free: - points_new = points_new.squeeze(0) - - if return_mat: - rot_mat_T = torch.einsum('jka->ajk', rot_mat_T) - if batch_free: - rot_mat_T = rot_mat_T.squeeze(0) - return points_new, rot_mat_T - else: - return points_new - - -@array_converter(apply_to=('boxes_xywhr', )) -def xywhr2xyxyr(boxes_xywhr): - """Convert a rotated boxes in XYWHR format to XYXYR format. - - Args: - boxes_xywhr (torch.Tensor | np.ndarray): Rotated boxes in XYWHR format. - - Returns: - (torch.Tensor | np.ndarray): Converted boxes in XYXYR format. - """ - boxes = torch.zeros_like(boxes_xywhr) - half_w = boxes_xywhr[..., 2] / 2 - half_h = boxes_xywhr[..., 3] / 2 - - boxes[..., 0] = boxes_xywhr[..., 0] - half_w - boxes[..., 1] = boxes_xywhr[..., 1] - half_h - boxes[..., 2] = boxes_xywhr[..., 0] + half_w - boxes[..., 3] = boxes_xywhr[..., 1] + half_h - boxes[..., 4] = boxes_xywhr[..., 4] - return boxes - - -def get_box_type(box_type): - """Get the type and mode of box structure. - - Args: - box_type (str): The type of box structure. - The valid value are "LiDAR", "Camera", or "Depth". - - Raises: - ValueError: A ValueError is raised when `box_type` - does not belong to the three valid types. - - Returns: - tuple: Box type and box mode. 
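`xywhr2xyxyr` above typically bridges the `bev` property of the box classes and rotated-NMS style code that expects corner coordinates. A small sketch (made-up box, assumed `mmdet3d.core.bbox` exports):

```python
import torch

from mmdet3d.core.bbox import LiDARInstance3DBoxes, xywhr2xyxyr  # assumed exports

boxes = LiDARInstance3DBoxes(
    torch.tensor([[10.0, 2.0, -1.0, 4.0, 1.8, 1.6, 0.3]]))

# bev is (x_center, y_center, x_size, y_size, yaw); xywhr2xyxyr rewrites the
# centre/size part as (x_min, y_min, x_max, y_max, yaw).
print(boxes.bev)               # tensor([[10.0000,  2.0000,  4.0000,  1.8000,  0.3000]])
print(xywhr2xyxyr(boxes.bev))  # tensor([[ 8.0000,  1.1000, 12.0000,  2.9000,  0.3000]])
```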
- """ - from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes, - DepthInstance3DBoxes, LiDARInstance3DBoxes) - box_type_lower = box_type.lower() - if box_type_lower == 'lidar': - box_type_3d = LiDARInstance3DBoxes - box_mode_3d = Box3DMode.LIDAR - elif box_type_lower == 'camera': - box_type_3d = CameraInstance3DBoxes - box_mode_3d = Box3DMode.CAM - elif box_type_lower == 'depth': - box_type_3d = DepthInstance3DBoxes - box_mode_3d = Box3DMode.DEPTH - else: - raise ValueError('Only "box_type" of "camera", "lidar", "depth"' - f' are supported, got {box_type}') - - return box_type_3d, box_mode_3d - - -@array_converter(apply_to=('points_3d', 'proj_mat')) -def points_cam2img(points_3d, proj_mat, with_depth=False): - """Project points in camera coordinates to image coordinates. - - Args: - points_3d (torch.Tensor | np.ndarray): Points in shape (N, 3) - proj_mat (torch.Tensor | np.ndarray): - Transformation matrix between coordinates. - with_depth (bool, optional): Whether to keep depth in the output. - Defaults to False. - - Returns: - (torch.Tensor | np.ndarray): Points in image coordinates, - with shape [N, 2] if `with_depth=False`, else [N, 3]. - """ - points_shape = list(points_3d.shape) - points_shape[-1] = 1 - - assert len(proj_mat.shape) == 2, 'The dimension of the projection'\ - f' matrix should be 2 instead of {len(proj_mat.shape)}.' - d1, d2 = proj_mat.shape[:2] - assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( - d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ - f' ({d1}*{d2}) is not supported.' - if d1 == 3: - proj_mat_expanded = torch.eye( - 4, device=proj_mat.device, dtype=proj_mat.dtype) - proj_mat_expanded[:d1, :d2] = proj_mat - proj_mat = proj_mat_expanded - - # previous implementation use new_zeros, new_one yields better results - points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) - - point_2d = points_4 @ proj_mat.T - point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] - - if with_depth: - point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) - - return point_2d_res - - -@array_converter(apply_to=('points', 'cam2img')) -def points_img2cam(points, cam2img): - """Project points in image coordinates to camera coordinates. - - Args: - points (torch.Tensor): 2.5D points in 2D images, [N, 3], - 3 corresponds with x, y in the image and depth. - cam2img (torch.Tensor): Camera intrinsic matrix. The shape can be - [3, 3], [3, 4] or [4, 4]. - - Returns: - torch.Tensor: points in 3D space. [N, 3], - 3 corresponds with x, y, z in 3D space. - """ - assert cam2img.shape[0] <= 4 - assert cam2img.shape[1] <= 4 - assert points.shape[1] == 3 - - xys = points[:, :2] - depths = points[:, 2].view(-1, 1) - unnormed_xys = torch.cat([xys * depths, depths], dim=1) - - pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device) - pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img - inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1) - - # Do operation in homogeneous coordinates. - num_points = unnormed_xys.shape[0] - homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1) - points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3] - - return points3D - - -def mono_cam_box2vis(cam_box): - """This is a post-processing function on the bboxes from Mono-3D task. If - we want to perform projection visualization, we need to: - - 1. rotate the box along x-axis for np.pi / 2 (roll) - 2. change orientation from local yaw to global yaw - 3. 
convert yaw by (np.pi / 2 - yaw) - - After applying this function, we can project and draw it on 2D images. - - Args: - cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate - system before conversion. Could be gt bbox loaded from dataset - or network prediction output. - - Returns: - :obj:`CameraInstance3DBoxes`: Box after conversion. - """ - warning.warn('DeprecationWarning: The hack of yaw and dimension in the ' - 'monocular 3D detection on nuScenes has been removed. The ' - 'function mono_cam_box2vis will be deprecated.') - from . import CameraInstance3DBoxes - assert isinstance(cam_box, CameraInstance3DBoxes), \ - 'input bbox should be CameraInstance3DBoxes!' - - loc = cam_box.gravity_center - dim = cam_box.dims - yaw = cam_box.yaw - feats = cam_box.tensor[:, 7:] - # rotate along x-axis for np.pi / 2 - # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa - dim[:, [1, 2]] = dim[:, [2, 1]] - # change local yaw to global yaw for visualization - # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa - yaw += torch.atan2(loc[:, 0], loc[:, 2]) - # convert yaw by (-yaw - np.pi / 2) - # this is because mono 3D box class such as `NuScenesBox` has different - # definition of rotation with our `CameraInstance3DBoxes` - yaw = -yaw - np.pi / 2 - cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1) - cam_box = CameraInstance3DBoxes( - cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5)) - - return cam_box - - -def get_proj_mat_by_coord_type(img_meta, coord_type): - """Obtain image features using points. - - Args: - img_meta (dict): Meta info. - coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. - Can be case-insensitive. - - Returns: - torch.Tensor: transformation matrix. - """ - coord_type = coord_type.upper() - mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'} - assert coord_type in mapping.keys() - return img_meta[mapping[coord_type]] - - -def yaw2local(yaw, loc): - """Transform global yaw to local yaw (alpha in kitti) in camera - coordinates, ranges from -pi to pi. - - Args: - yaw (torch.Tensor): A vector with local yaw of each box. - shape: (N, ) - loc (torch.Tensor): gravity center of each box. - shape: (N, 3) - - Returns: - torch.Tensor: local yaw (alpha in kitti). - """ - local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2]) - larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False) - small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False) - if len(larger_idx) != 0: - local_yaw[larger_idx] -= 2 * np.pi - if len(small_idx) != 0: - local_yaw[small_idx] += 2 * np.pi - - return local_yaw +# Copyright (c) OpenMMLab. All rights reserved. +from logging import warning + +import numpy as np +import torch + +from mmdet3d.core.utils import array_converter + + +@array_converter(apply_to=('val', )) +def limit_period(val, offset=0.5, period=np.pi): + """Limit the value into a period for periodic function. + + Args: + val (torch.Tensor | np.ndarray): The value to be converted. + offset (float, optional): Offset to set the value range. + Defaults to 0.5. + period ([type], optional): Period of the value. Defaults to np.pi. 
+ + Returns: + (torch.Tensor | np.ndarray): Value in the range of + [-offset * period, (1-offset) * period] + """ + limited_val = val - torch.floor(val / period + offset) * period + return limited_val + + +@array_converter(apply_to=('points', 'angles')) +def rotation_3d_in_axis(points, + angles, + axis=0, + return_mat=False, + clockwise=False): + """Rotate points by angles according to axis. + + Args: + points (np.ndarray | torch.Tensor | list | tuple ): + Points of shape (N, M, 3). + angles (np.ndarray | torch.Tensor | list | tuple | float): + Vector of angles in shape (N,) + axis (int, optional): The axis to be rotated. Defaults to 0. + return_mat: Whether or not return the rotation matrix (transposed). + Defaults to False. + clockwise: Whether the rotation is clockwise. Defaults to False. + + Raises: + ValueError: when the axis is not in range [0, 1, 2], it will + raise value error. + + Returns: + (torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3). + """ + batch_free = len(points.shape) == 2 + if batch_free: + points = points[None] + + if isinstance(angles, float) or len(angles.shape) == 0: + angles = torch.full(points.shape[:1], angles) + + assert len(points.shape) == 3 and len(angles.shape) == 1 \ + and points.shape[0] == angles.shape[0], f'Incorrect shape of points ' \ + f'angles: {points.shape}, {angles.shape}' + + assert points.shape[-1] in [2, 3], \ + f'Points size should be 2 or 3 instead of {points.shape[-1]}' + + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + ones = torch.ones_like(rot_cos) + zeros = torch.zeros_like(rot_cos) + + if points.shape[-1] == 3: + if axis == 1 or axis == -2: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]) + ]) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, rot_sin, zeros]), + torch.stack([-rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]) + ]) + elif axis == 0 or axis == -3: + rot_mat_T = torch.stack([ + torch.stack([ones, zeros, zeros]), + torch.stack([zeros, rot_cos, rot_sin]), + torch.stack([zeros, -rot_sin, rot_cos]) + ]) + else: + raise ValueError(f'axis should in range ' + f'[-3, -2, -1, 0, 1, 2], got {axis}') + else: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, rot_sin]), + torch.stack([-rot_sin, rot_cos]) + ]) + + if clockwise: + rot_mat_T = rot_mat_T.transpose(0, 1) + + if points.shape[0] == 0: + points_new = points + else: + points_new = torch.einsum('aij,jka->aik', points, rot_mat_T) + + if batch_free: + points_new = points_new.squeeze(0) + + if return_mat: + rot_mat_T = torch.einsum('jka->ajk', rot_mat_T) + if batch_free: + rot_mat_T = rot_mat_T.squeeze(0) + return points_new, rot_mat_T + else: + return points_new + + +@array_converter(apply_to=('boxes_xywhr', )) +def xywhr2xyxyr(boxes_xywhr): + """Convert a rotated boxes in XYWHR format to XYXYR format. + + Args: + boxes_xywhr (torch.Tensor | np.ndarray): Rotated boxes in XYWHR format. + + Returns: + (torch.Tensor | np.ndarray): Converted boxes in XYXYR format. + """ + boxes = torch.zeros_like(boxes_xywhr) + half_w = boxes_xywhr[..., 2] / 2 + half_h = boxes_xywhr[..., 3] / 2 + + boxes[..., 0] = boxes_xywhr[..., 0] - half_w + boxes[..., 1] = boxes_xywhr[..., 1] - half_h + boxes[..., 2] = boxes_xywhr[..., 0] + half_w + boxes[..., 3] = boxes_xywhr[..., 1] + half_h + boxes[..., 4] = boxes_xywhr[..., 4] + return boxes + + +def get_box_type(box_type): + """Get the type and mode of box structure. 
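+
+    The ``box_type`` string is matched case-insensitively, so ``'LiDAR'``,
+    ``'lidar'`` and ``'Lidar'`` are all accepted.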
+ + Args: + box_type (str): The type of box structure. + The valid value are "LiDAR", "Camera", or "Depth". + + Raises: + ValueError: A ValueError is raised when `box_type` + does not belong to the three valid types. + + Returns: + tuple: Box type and box mode. + """ + from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes, + DepthInstance3DBoxes, LiDARInstance3DBoxes) + box_type_lower = box_type.lower() + if box_type_lower == 'lidar': + box_type_3d = LiDARInstance3DBoxes + box_mode_3d = Box3DMode.LIDAR + elif box_type_lower == 'camera': + box_type_3d = CameraInstance3DBoxes + box_mode_3d = Box3DMode.CAM + elif box_type_lower == 'depth': + box_type_3d = DepthInstance3DBoxes + box_mode_3d = Box3DMode.DEPTH + else: + raise ValueError('Only "box_type" of "camera", "lidar", "depth"' + f' are supported, got {box_type}') + + return box_type_3d, box_mode_3d + + +@array_converter(apply_to=('points_3d', 'proj_mat')) +def points_cam2img(points_3d, proj_mat, with_depth=False): + """Project points in camera coordinates to image coordinates. + + Args: + points_3d (torch.Tensor | np.ndarray): Points in shape (N, 3) + proj_mat (torch.Tensor | np.ndarray): + Transformation matrix between coordinates. + with_depth (bool, optional): Whether to keep depth in the output. + Defaults to False. + + Returns: + (torch.Tensor | np.ndarray): Points in image coordinates, + with shape [N, 2] if `with_depth=False`, else [N, 3]. + """ + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + + assert len(proj_mat.shape) == 2, 'The dimension of the projection'\ + f' matrix should be 2 instead of {len(proj_mat.shape)}.' + d1, d2 = proj_mat.shape[:2] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( + d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ + f' ({d1}*{d2}) is not supported.' + if d1 == 3: + proj_mat_expanded = torch.eye( + 4, device=proj_mat.device, dtype=proj_mat.dtype) + proj_mat_expanded[:d1, :d2] = proj_mat + proj_mat = proj_mat_expanded + + # previous implementation use new_zeros, new_one yields better results + points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) + + point_2d = points_4 @ proj_mat.T + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + + if with_depth: + point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + + return point_2d_res + + +@array_converter(apply_to=('points', 'cam2img')) +def points_img2cam(points, cam2img): + """Project points in image coordinates to camera coordinates. + + Args: + points (torch.Tensor): 2.5D points in 2D images, [N, 3], + 3 corresponds with x, y in the image and depth. + cam2img (torch.Tensor): Camera intrinsic matrix. The shape can be + [3, 3], [3, 4] or [4, 4]. + + Returns: + torch.Tensor: points in 3D space. [N, 3], + 3 corresponds with x, y, z in 3D space. + """ + assert cam2img.shape[0] <= 4 + assert cam2img.shape[1] <= 4 + assert points.shape[1] == 3 + + xys = points[:, :2] + depths = points[:, 2].view(-1, 1) + unnormed_xys = torch.cat([xys * depths, depths], dim=1) + + pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device) + pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img + inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1) + + # Do operation in homogeneous coordinates. 
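+    # Each 2.5D point (u, v, d) has been lifted to [u * d, v * d, d, 1];
+    # right-multiplying by the transposed inverse of the padded intrinsic
+    # matrix below is equivalent to applying cam2img^-1 to each column
+    # vector, which recovers the (x, y, z) position in camera coordinates.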
+ num_points = unnormed_xys.shape[0] + homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1) + points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3] + + return points3D + + +def mono_cam_box2vis(cam_box): + """This is a post-processing function on the bboxes from Mono-3D task. If + we want to perform projection visualization, we need to: + + 1. rotate the box along x-axis for np.pi / 2 (roll) + 2. change orientation from local yaw to global yaw + 3. convert yaw by (np.pi / 2 - yaw) + + After applying this function, we can project and draw it on 2D images. + + Args: + cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate + system before conversion. Could be gt bbox loaded from dataset + or network prediction output. + + Returns: + :obj:`CameraInstance3DBoxes`: Box after conversion. + """ + warning.warn('DeprecationWarning: The hack of yaw and dimension in the ' + 'monocular 3D detection on nuScenes has been removed. The ' + 'function mono_cam_box2vis will be deprecated.') + from . import CameraInstance3DBoxes + assert isinstance(cam_box, CameraInstance3DBoxes), \ + 'input bbox should be CameraInstance3DBoxes!' + + loc = cam_box.gravity_center + dim = cam_box.dims + yaw = cam_box.yaw + feats = cam_box.tensor[:, 7:] + # rotate along x-axis for np.pi / 2 + # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa + dim[:, [1, 2]] = dim[:, [2, 1]] + # change local yaw to global yaw for visualization + # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa + yaw += torch.atan2(loc[:, 0], loc[:, 2]) + # convert yaw by (-yaw - np.pi / 2) + # this is because mono 3D box class such as `NuScenesBox` has different + # definition of rotation with our `CameraInstance3DBoxes` + yaw = -yaw - np.pi / 2 + cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1) + cam_box = CameraInstance3DBoxes( + cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5)) + + return cam_box + + +def get_proj_mat_by_coord_type(img_meta, coord_type): + """Obtain image features using points. + + Args: + img_meta (dict): Meta info. + coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. + Can be case-insensitive. + + Returns: + torch.Tensor: transformation matrix. + """ + coord_type = coord_type.upper() + mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'} + assert coord_type in mapping.keys() + return img_meta[mapping[coord_type]] + + +def yaw2local(yaw, loc): + """Transform global yaw to local yaw (alpha in kitti) in camera + coordinates, ranges from -pi to pi. + + Args: + yaw (torch.Tensor): A vector with local yaw of each box. + shape: (N, ) + loc (torch.Tensor): gravity center of each box. + shape: (N, 3) + + Returns: + torch.Tensor: local yaw (alpha in kitti). + """ + local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2]) + larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False) + small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False) + if len(larger_idx) != 0: + local_yaw[larger_idx] -= 2 * np.pi + if len(small_idx) != 0: + local_yaw[small_idx] += 2 * np.pi + + return local_yaw diff --git a/mmdet3d/core/bbox/transforms.py b/mmdet3d/core/bbox/transforms.py index 8a2eb90..f02f573 100644 --- a/mmdet3d/core/bbox/transforms.py +++ b/mmdet3d/core/bbox/transforms.py @@ -1,76 +1,76 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch - - -def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical): - """Map bboxes from testing scale to original image scale. - - Args: - bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. - scale_factor (float): Scale factor. - flip_horizontal (bool): Whether to flip horizontally. - flip_vertical (bool): Whether to flip vertically. - - Returns: - :obj:`BaseInstance3DBoxes`: Boxes mapped back. - """ - new_bboxes = bboxes.clone() - if flip_horizontal: - new_bboxes.flip('horizontal') - if flip_vertical: - new_bboxes.flip('vertical') - new_bboxes.scale(1 / scale_factor) - - return new_bboxes - - -def bbox3d2roi(bbox_list): - """Convert a list of bounding boxes to roi format. - - Args: - bbox_list (list[torch.Tensor]): A list of bounding boxes - corresponding to a batch of images. - - Returns: - torch.Tensor: Region of interests in shape (n, c), where - the channels are in order of [batch_ind, x, y ...]. - """ - rois_list = [] - for img_id, bboxes in enumerate(bbox_list): - if bboxes.size(0) > 0: - img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) - rois = torch.cat([img_inds, bboxes], dim=-1) - else: - rois = torch.zeros_like(bboxes) - rois_list.append(rois) - rois = torch.cat(rois_list, 0) - return rois - - -def bbox3d2result(bboxes, scores, labels, attrs=None): - """Convert detection results to a list of numpy arrays. - - Args: - bboxes (torch.Tensor): Bounding boxes with shape (N, 5). - labels (torch.Tensor): Labels with shape (N, ). - scores (torch.Tensor): Scores with shape (N, ). - attrs (torch.Tensor, optional): Attributes with shape (N, ). - Defaults to None. - - Returns: - dict[str, torch.Tensor]: Bounding box results in cpu mode. - - - boxes_3d (torch.Tensor): 3D boxes. - - scores (torch.Tensor): Prediction scores. - - labels_3d (torch.Tensor): Box labels. - - attrs_3d (torch.Tensor, optional): Box attributes. - """ - result_dict = dict( - boxes_3d=bboxes.to('cpu'), - scores_3d=scores.cpu(), - labels_3d=labels.cpu()) - - if attrs is not None: - result_dict['attrs_3d'] = attrs.cpu() - - return result_dict +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical): + """Map bboxes from testing scale to original image scale. + + Args: + bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. + scale_factor (float): Scale factor. + flip_horizontal (bool): Whether to flip horizontally. + flip_vertical (bool): Whether to flip vertically. + + Returns: + :obj:`BaseInstance3DBoxes`: Boxes mapped back. + """ + new_bboxes = bboxes.clone() + if flip_horizontal: + new_bboxes.flip('horizontal') + if flip_vertical: + new_bboxes.flip('vertical') + new_bboxes.scale(1 / scale_factor) + + return new_bboxes + + +def bbox3d2roi(bbox_list): + """Convert a list of bounding boxes to roi format. + + Args: + bbox_list (list[torch.Tensor]): A list of bounding boxes + corresponding to a batch of images. + + Returns: + torch.Tensor: Region of interests in shape (n, c), where + the channels are in order of [batch_ind, x, y ...]. + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes], dim=-1) + else: + rois = torch.zeros_like(bboxes) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def bbox3d2result(bboxes, scores, labels, attrs=None): + """Convert detection results to a list of numpy arrays. 
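+
+    The boxes are kept as a CPU box structure, while the scores, labels and
+    optional attributes are returned as CPU tensors.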
+ + Args: + bboxes (torch.Tensor): Bounding boxes with shape (N, 5). + labels (torch.Tensor): Labels with shape (N, ). + scores (torch.Tensor): Scores with shape (N, ). + attrs (torch.Tensor, optional): Attributes with shape (N, ). + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + """ + result_dict = dict( + boxes_3d=bboxes.to('cpu'), + scores_3d=scores.cpu(), + labels_3d=labels.cpu()) + + if attrs is not None: + result_dict['attrs_3d'] = attrs.cpu() + + return result_dict diff --git a/mmdet3d/core/evaluation/__init__.py b/mmdet3d/core/evaluation/__init__.py index b1d489f..7e81354 100644 --- a/mmdet3d/core/evaluation/__init__.py +++ b/mmdet3d/core/evaluation/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .indoor_eval import indoor_eval -from .instance_seg_eval import instance_seg_eval -from .kitti_utils import kitti_eval, kitti_eval_coco_style -from .lyft_eval import lyft_eval -from .seg_eval import seg_eval - -__all__ = [ - 'kitti_eval_coco_style', 'kitti_eval', 'indoor_eval', 'lyft_eval', - 'seg_eval', 'instance_seg_eval' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .indoor_eval import indoor_eval +from .instance_seg_eval import instance_seg_eval +from .kitti_utils import kitti_eval, kitti_eval_coco_style +from .lyft_eval import lyft_eval +from .seg_eval import seg_eval + +__all__ = [ + 'kitti_eval_coco_style', 'kitti_eval', 'indoor_eval', 'lyft_eval', + 'seg_eval', 'instance_seg_eval' +] diff --git a/mmdet3d/core/evaluation/indoor_eval.py b/mmdet3d/core/evaluation/indoor_eval.py index 2ff9877..b384d78 100644 --- a/mmdet3d/core/evaluation/indoor_eval.py +++ b/mmdet3d/core/evaluation/indoor_eval.py @@ -1,309 +1,309 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from mmcv.utils import print_log -from terminaltables import AsciiTable - - -def average_precision(recalls, precisions, mode='area'): - """Calculate average precision (for single or multiple scales). - - Args: - recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) - or (num_dets, ). - precisions (np.ndarray): Precisions with shape of - (num_scales, num_dets) or (num_dets, ). - mode (str): 'area' or '11points', 'area' means calculating the area - under precision-recall curve, '11points' means calculating - the average precision of recalls at [0, 0.1, ..., 1] - - Returns: - float or np.ndarray: Calculated average precision. 
- """ - if recalls.ndim == 1: - recalls = recalls[np.newaxis, :] - precisions = precisions[np.newaxis, :] - - assert recalls.shape == precisions.shape - assert recalls.ndim == 2 - - num_scales = recalls.shape[0] - ap = np.zeros(num_scales, dtype=np.float32) - if mode == 'area': - zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) - ones = np.ones((num_scales, 1), dtype=recalls.dtype) - mrec = np.hstack((zeros, recalls, ones)) - mpre = np.hstack((zeros, precisions, zeros)) - for i in range(mpre.shape[1] - 1, 0, -1): - mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) - for i in range(num_scales): - ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] - ap[i] = np.sum( - (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) - elif mode == '11points': - for i in range(num_scales): - for thr in np.arange(0, 1 + 1e-3, 0.1): - precs = precisions[i, recalls[i, :] >= thr] - prec = precs.max() if precs.size > 0 else 0 - ap[i] += prec - ap /= 11 - else: - raise ValueError( - 'Unrecognized mode, only "area" and "11points" are supported') - return ap - - -def eval_det_cls(pred, gt, iou_thr=None): - """Generic functions to compute precision/recall for object detection for a - single class. - - Args: - pred (dict): Predictions mapping from image id to bounding boxes - and scores. - gt (dict): Ground truths mapping from image id to bounding boxes. - iou_thr (list[float]): A list of iou thresholds. - - Return: - tuple (np.ndarray, np.ndarray, float): Recalls, precisions and - average precision. - """ - - # {img_id: {'bbox': box structure, 'det': matched list}} - class_recs = {} - npos = 0 - for img_id in gt.keys(): - cur_gt_num = len(gt[img_id]) - if cur_gt_num != 0: - gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32) - for i in range(cur_gt_num): - gt_cur[i] = gt[img_id][i].tensor - bbox = gt[img_id][0].new_box(gt_cur) - else: - bbox = gt[img_id] - det = [[False] * len(bbox) for i in iou_thr] - npos += len(bbox) - class_recs[img_id] = {'bbox': bbox, 'det': det} - - # construct dets - image_ids = [] - confidence = [] - ious = [] - for img_id in pred.keys(): - cur_num = len(pred[img_id]) - if cur_num == 0: - continue - pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32) - box_idx = 0 - for box, score in pred[img_id]: - image_ids.append(img_id) - confidence.append(score) - pred_cur[box_idx] = box.tensor - box_idx += 1 - pred_cur = box.new_box(pred_cur) - gt_cur = class_recs[img_id]['bbox'] - if len(gt_cur) > 0: - # calculate iou in each image - iou_cur = pred_cur.overlaps(pred_cur, gt_cur) - for i in range(cur_num): - ious.append(iou_cur[i]) - else: - for i in range(cur_num): - ious.append(np.zeros(1)) - - confidence = np.array(confidence) - - # sort by confidence - sorted_ind = np.argsort(-confidence) - image_ids = [image_ids[x] for x in sorted_ind] - ious = [ious[x] for x in sorted_ind] - - # go down dets and mark TPs and FPs - nd = len(image_ids) - tp_thr = [np.zeros(nd) for i in iou_thr] - fp_thr = [np.zeros(nd) for i in iou_thr] - for d in range(nd): - R = class_recs[image_ids[d]] - iou_max = -np.inf - BBGT = R['bbox'] - cur_iou = ious[d] - - if len(BBGT) > 0: - # compute overlaps - for j in range(len(BBGT)): - # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...])) - iou = cur_iou[j] - if iou > iou_max: - iou_max = iou - jmax = j - - for iou_idx, thresh in enumerate(iou_thr): - if iou_max > thresh: - if not R['det'][iou_idx][jmax]: - tp_thr[iou_idx][d] = 1. - R['det'][iou_idx][jmax] = 1 - else: - fp_thr[iou_idx][d] = 1. - else: - fp_thr[iou_idx][d] = 1. 
- - ret = [] - for iou_idx, thresh in enumerate(iou_thr): - # compute precision recall - fp = np.cumsum(fp_thr[iou_idx]) - tp = np.cumsum(tp_thr[iou_idx]) - recall = tp / float(npos) - # avoid divide by zero in case the first detection matches a difficult - # ground truth - precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) - ap = average_precision(recall, precision) - ret.append((recall, precision, ap)) - - return ret - - -def eval_map_recall(pred, gt, ovthresh=None): - """Evaluate mAP and recall. - - Generic functions to compute precision/recall for object detection - for multiple classes. - - Args: - pred (dict): Information of detection results, - which maps class_id and predictions. - gt (dict): Information of ground truths, which maps class_id and - ground truths. - ovthresh (list[float], optional): iou threshold. Default: None. - - Return: - tuple[dict]: dict results of recall, AP, and precision for all classes. - """ - - ret_values = {} - for classname in gt.keys(): - if classname in pred: - ret_values[classname] = eval_det_cls(pred[classname], - gt[classname], ovthresh) - recall = [{} for i in ovthresh] - precision = [{} for i in ovthresh] - ap = [{} for i in ovthresh] - - for label in gt.keys(): - for iou_idx, thresh in enumerate(ovthresh): - if label in pred: - recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][ - label] = ret_values[label][iou_idx] - else: - recall[iou_idx][label] = np.zeros(1) - precision[iou_idx][label] = np.zeros(1) - ap[iou_idx][label] = np.zeros(1) - - return recall, precision, ap - - -def indoor_eval(gt_annos, - dt_annos, - metric, - label2cat, - logger=None, - box_type_3d=None, - box_mode_3d=None): - """Indoor Evaluation. - - Evaluate the result of the detection. - - Args: - gt_annos (list[dict]): Ground truth annotations. - dt_annos (list[dict]): Detection annotations. the dict - includes the following keys - - - labels_3d (torch.Tensor): Labels of boxes. - - boxes_3d (:obj:`BaseInstance3DBoxes`): - 3D bounding boxes in Depth coordinate. - - scores_3d (torch.Tensor): Scores of boxes. - metric (list[float]): IoU thresholds for computing average precisions. - label2cat (dict): Map from label to category. - logger (logging.Logger | str, optional): The way to print the mAP - summary. See `mmdet.utils.print_log()` for details. Default: None. - - Return: - dict[str, float]: Dict of results. 
- """ - assert len(dt_annos) == len(gt_annos) - pred = {} # map {class_id: pred} - gt = {} # map {class_id: gt} - for img_id in range(len(dt_annos)): - # parse detected annotations - det_anno = dt_annos[img_id] - for i in range(len(det_anno['labels_3d'])): - label = det_anno['labels_3d'].numpy()[i] - bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i] - score = det_anno['scores_3d'].numpy()[i] - if label not in pred: - pred[int(label)] = {} - if img_id not in pred[label]: - pred[int(label)][img_id] = [] - if label not in gt: - gt[int(label)] = {} - if img_id not in gt[label]: - gt[int(label)][img_id] = [] - pred[int(label)][img_id].append((bbox, score)) - - # parse gt annotations - gt_anno = gt_annos[img_id] - if gt_anno['gt_num'] != 0: - gt_boxes = box_type_3d( - gt_anno['gt_boxes_upright_depth'], - box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1], - origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d) - labels_3d = gt_anno['class'] - else: - gt_boxes = box_type_3d(np.array([], dtype=np.float32)) - labels_3d = np.array([], dtype=np.int64) - - for i in range(len(labels_3d)): - label = labels_3d[i] - bbox = gt_boxes[i] - if label not in gt: - gt[label] = {} - if img_id not in gt[label]: - gt[label][img_id] = [] - gt[label][img_id].append(bbox) - - rec, prec, ap = eval_map_recall(pred, gt, metric) - ret_dict = dict() - header = ['classes'] - table_columns = [[label2cat[label] - for label in ap[0].keys()] + ['Overall']] - - for i, iou_thresh in enumerate(metric): - header.append(f'AP_{iou_thresh:.2f}') - header.append(f'AR_{iou_thresh:.2f}') - rec_list = [] - for label in ap[i].keys(): - ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float( - ap[i][label][0]) - ret_dict[f'mAP_{iou_thresh:.2f}'] = float( - np.mean(list(ap[i].values()))) - - table_columns.append(list(map(float, list(ap[i].values())))) - table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']] - table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] - - for label in rec[i].keys(): - ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float( - rec[i][label][-1]) - rec_list.append(rec[i][label][-1]) - ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list)) - - table_columns.append(list(map(float, rec_list))) - table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']] - table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] - - table_data = [header] - table_rows = list(zip(*table_columns)) - table_data += table_rows - table = AsciiTable(table_data) - table.inner_footing_row_border = True - print_log('\n' + table.table, logger=logger) - - return ret_dict +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.utils import print_log +from terminaltables import AsciiTable + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) + or (num_dets, ). + precisions (np.ndarray): Precisions with shape of + (num_scales, num_dets) or (num_dets, ). + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or np.ndarray: Calculated average precision. 
+ """ + if recalls.ndim == 1: + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + + assert recalls.shape == precisions.shape + assert recalls.ndim == 2 + + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + return ap + + +def eval_det_cls(pred, gt, iou_thr=None): + """Generic functions to compute precision/recall for object detection for a + single class. + + Args: + pred (dict): Predictions mapping from image id to bounding boxes + and scores. + gt (dict): Ground truths mapping from image id to bounding boxes. + iou_thr (list[float]): A list of iou thresholds. + + Return: + tuple (np.ndarray, np.ndarray, float): Recalls, precisions and + average precision. + """ + + # {img_id: {'bbox': box structure, 'det': matched list}} + class_recs = {} + npos = 0 + for img_id in gt.keys(): + cur_gt_num = len(gt[img_id]) + if cur_gt_num != 0: + gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32) + for i in range(cur_gt_num): + gt_cur[i] = gt[img_id][i].tensor + bbox = gt[img_id][0].new_box(gt_cur) + else: + bbox = gt[img_id] + det = [[False] * len(bbox) for i in iou_thr] + npos += len(bbox) + class_recs[img_id] = {'bbox': bbox, 'det': det} + + # construct dets + image_ids = [] + confidence = [] + ious = [] + for img_id in pred.keys(): + cur_num = len(pred[img_id]) + if cur_num == 0: + continue + pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32) + box_idx = 0 + for box, score in pred[img_id]: + image_ids.append(img_id) + confidence.append(score) + pred_cur[box_idx] = box.tensor + box_idx += 1 + pred_cur = box.new_box(pred_cur) + gt_cur = class_recs[img_id]['bbox'] + if len(gt_cur) > 0: + # calculate iou in each image + iou_cur = pred_cur.overlaps(pred_cur, gt_cur) + for i in range(cur_num): + ious.append(iou_cur[i]) + else: + for i in range(cur_num): + ious.append(np.zeros(1)) + + confidence = np.array(confidence) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + image_ids = [image_ids[x] for x in sorted_ind] + ious = [ious[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp_thr = [np.zeros(nd) for i in iou_thr] + fp_thr = [np.zeros(nd) for i in iou_thr] + for d in range(nd): + R = class_recs[image_ids[d]] + iou_max = -np.inf + BBGT = R['bbox'] + cur_iou = ious[d] + + if len(BBGT) > 0: + # compute overlaps + for j in range(len(BBGT)): + # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...])) + iou = cur_iou[j] + if iou > iou_max: + iou_max = iou + jmax = j + + for iou_idx, thresh in enumerate(iou_thr): + if iou_max > thresh: + if not R['det'][iou_idx][jmax]: + tp_thr[iou_idx][d] = 1. + R['det'][iou_idx][jmax] = 1 + else: + fp_thr[iou_idx][d] = 1. + else: + fp_thr[iou_idx][d] = 1. 
+ + ret = [] + for iou_idx, thresh in enumerate(iou_thr): + # compute precision recall + fp = np.cumsum(fp_thr[iou_idx]) + tp = np.cumsum(tp_thr[iou_idx]) + recall = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = average_precision(recall, precision) + ret.append((recall, precision, ap)) + + return ret + + +def eval_map_recall(pred, gt, ovthresh=None): + """Evaluate mAP and recall. + + Generic functions to compute precision/recall for object detection + for multiple classes. + + Args: + pred (dict): Information of detection results, + which maps class_id and predictions. + gt (dict): Information of ground truths, which maps class_id and + ground truths. + ovthresh (list[float], optional): iou threshold. Default: None. + + Return: + tuple[dict]: dict results of recall, AP, and precision for all classes. + """ + + ret_values = {} + for classname in gt.keys(): + if classname in pred: + ret_values[classname] = eval_det_cls(pred[classname], + gt[classname], ovthresh) + recall = [{} for i in ovthresh] + precision = [{} for i in ovthresh] + ap = [{} for i in ovthresh] + + for label in gt.keys(): + for iou_idx, thresh in enumerate(ovthresh): + if label in pred: + recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][ + label] = ret_values[label][iou_idx] + else: + recall[iou_idx][label] = np.zeros(1) + precision[iou_idx][label] = np.zeros(1) + ap[iou_idx][label] = np.zeros(1) + + return recall, precision, ap + + +def indoor_eval(gt_annos, + dt_annos, + metric, + label2cat, + logger=None, + box_type_3d=None, + box_mode_3d=None): + """Indoor Evaluation. + + Evaluate the result of the detection. + + Args: + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection annotations. the dict + includes the following keys + + - labels_3d (torch.Tensor): Labels of boxes. + - boxes_3d (:obj:`BaseInstance3DBoxes`): + 3D bounding boxes in Depth coordinate. + - scores_3d (torch.Tensor): Scores of boxes. + metric (list[float]): IoU thresholds for computing average precisions. + label2cat (dict): Map from label to category. + logger (logging.Logger | str, optional): The way to print the mAP + summary. See `mmdet.utils.print_log()` for details. Default: None. + + Return: + dict[str, float]: Dict of results. 
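+
+    Note:
+        For indoor datasets such as ScanNet and SUN RGB-D, this is typically
+        called with ``box_type_3d=DepthInstance3DBoxes`` and
+        ``box_mode_3d=Box3DMode.DEPTH``.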
+ """ + assert len(dt_annos) == len(gt_annos) + pred = {} # map {class_id: pred} + gt = {} # map {class_id: gt} + for img_id in range(len(dt_annos)): + # parse detected annotations + det_anno = dt_annos[img_id] + for i in range(len(det_anno['labels_3d'])): + label = det_anno['labels_3d'].numpy()[i] + bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i] + score = det_anno['scores_3d'].numpy()[i] + if label not in pred: + pred[int(label)] = {} + if img_id not in pred[label]: + pred[int(label)][img_id] = [] + if label not in gt: + gt[int(label)] = {} + if img_id not in gt[label]: + gt[int(label)][img_id] = [] + pred[int(label)][img_id].append((bbox, score)) + + # parse gt annotations + gt_anno = gt_annos[img_id] + if gt_anno['gt_num'] != 0: + gt_boxes = box_type_3d( + gt_anno['gt_boxes_upright_depth'], + box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d) + labels_3d = gt_anno['class'] + else: + gt_boxes = box_type_3d(np.array([], dtype=np.float32)) + labels_3d = np.array([], dtype=np.int64) + + for i in range(len(labels_3d)): + label = labels_3d[i] + bbox = gt_boxes[i] + if label not in gt: + gt[label] = {} + if img_id not in gt[label]: + gt[label][img_id] = [] + gt[label][img_id].append(bbox) + + rec, prec, ap = eval_map_recall(pred, gt, metric) + ret_dict = dict() + header = ['classes'] + table_columns = [[label2cat[label] + for label in ap[0].keys()] + ['Overall']] + + for i, iou_thresh in enumerate(metric): + header.append(f'AP_{iou_thresh:.2f}') + header.append(f'AR_{iou_thresh:.2f}') + rec_list = [] + for label in ap[i].keys(): + ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float( + ap[i][label][0]) + ret_dict[f'mAP_{iou_thresh:.2f}'] = float( + np.mean(list(ap[i].values()))) + + table_columns.append(list(map(float, list(ap[i].values())))) + table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + for label in rec[i].keys(): + ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float( + rec[i][label][-1]) + rec_list.append(rec[i][label][-1]) + ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list)) + + table_columns.append(list(map(float, rec_list))) + table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + table_data = [header] + table_rows = list(zip(*table_columns)) + table_data += table_rows + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + + return ret_dict diff --git a/mmdet3d/core/evaluation/instance_seg_eval.py b/mmdet3d/core/evaluation/instance_seg_eval.py index 31f5110..87b7efe 100644 --- a/mmdet3d/core/evaluation/instance_seg_eval.py +++ b/mmdet3d/core/evaluation/instance_seg_eval.py @@ -1,128 +1,128 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -from mmcv.utils import print_log -from terminaltables import AsciiTable - -from .scannet_utils.evaluate_semantic_instance import scannet_eval - - -def aggregate_predictions(masks, labels, scores, valid_class_ids): - """Maps predictions to ScanNet evaluator format. - - Args: - masks (list[torch.Tensor]): Per scene predicted instance masks. - labels (list[torch.Tensor]): Per scene predicted instance labels. - scores (list[torch.Tensor]): Per scene predicted instance scores. - valid_class_ids (tuple[int]): Ids of valid categories. - - Returns: - list[dict]: Per scene aggregated predictions. 
- """ - infos = [] - for id, (mask, label, score) in enumerate(zip(masks, labels, scores)): - mask = mask.clone().numpy() - label = label.clone().numpy() - score = score.clone().numpy() - info = dict() - n_instances = mask.max() + 1 - for i in range(n_instances): - # match pred_instance['filename'] from assign_instances_for_scan - file_name = f'{id}_{i}' - info[file_name] = dict() - info[file_name]['mask'] = (mask == i).astype(np.int) - info[file_name]['label_id'] = valid_class_ids[label[i]] - info[file_name]['conf'] = score[i] - infos.append(info) - return infos - - -def rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids): - """Maps gt instance and semantic masks to instance masks for ScanNet - evaluator. - - Args: - gt_semantic_masks (list[torch.Tensor]): Per scene gt semantic masks. - gt_instance_masks (list[torch.Tensor]): Per scene gt instance masks. - valid_class_ids (tuple[int]): Ids of valid categories. - - Returns: - list[np.array]: Per scene instance masks. - """ - renamed_instance_masks = [] - for semantic_mask, instance_mask in zip(gt_semantic_masks, - gt_instance_masks): - semantic_mask = semantic_mask.clone().numpy() - instance_mask = instance_mask.clone().numpy() - unique = np.unique(instance_mask) - assert len(unique) < 1000 - for i in unique: - semantic_instance = semantic_mask[instance_mask == i] - semantic_unique = np.unique(semantic_instance) - assert len(semantic_unique) == 1 - if semantic_unique[0] < len(valid_class_ids): - instance_mask[ - instance_mask == - i] = 1000 * valid_class_ids[semantic_unique[0]] + i - renamed_instance_masks.append(instance_mask) - return renamed_instance_masks - - -def instance_seg_eval(gt_semantic_masks, - gt_instance_masks, - pred_instance_masks, - pred_instance_labels, - pred_instance_scores, - valid_class_ids, - class_labels, - options=None, - logger=None): - """Instance Segmentation Evaluation. - - Evaluate the result of the instance segmentation. - - Args: - gt_semantic_masks (list[torch.Tensor]): Ground truth semantic masks. - gt_instance_masks (list[torch.Tensor]): Ground truth instance masks. - pred_instance_masks (list[torch.Tensor]): Predicted instance masks. - pred_instance_labels (list[torch.Tensor]): Predicted instance labels. - pred_instance_scores (list[torch.Tensor]): Predicted instance labels. - valid_class_ids (tuple[int]): Ids of valid categories. - class_labels (tuple[str]): Names of valid categories. - options (dict, optional): Additional options. Keys may contain: - `overlaps`, `min_region_sizes`, `distance_threshes`, - `distance_confs`. Default: None. - logger (logging.Logger | str, optional): The way to print the mAP - summary. See `mmdet.utils.print_log()` for details. Default: None. - - Returns: - dict[str, float]: Dict of results. 
- """ - assert len(valid_class_ids) == len(class_labels) - id_to_label = { - valid_class_ids[i]: class_labels[i] - for i in range(len(valid_class_ids)) - } - preds = aggregate_predictions( - masks=pred_instance_masks, - labels=pred_instance_labels, - scores=pred_instance_scores, - valid_class_ids=valid_class_ids) - gts = rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids) - metrics = scannet_eval( - preds=preds, - gts=gts, - options=options, - valid_class_ids=valid_class_ids, - class_labels=class_labels, - id_to_label=id_to_label) - header = ['classes', 'AP_0.25', 'AP_0.50', 'AP'] - rows = [] - for label, data in metrics['classes'].items(): - aps = [data['ap25%'], data['ap50%'], data['ap']] - rows.append([label] + [f'{ap:.4f}' for ap in aps]) - aps = metrics['all_ap_25%'], metrics['all_ap_50%'], metrics['all_ap'] - footer = ['Overall'] + [f'{ap:.4f}' for ap in aps] - table = AsciiTable([header] + rows + [footer]) - table.inner_footing_row_border = True - print_log('\n' + table.table, logger=logger) - return metrics +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + +from .scannet_utils.evaluate_semantic_instance import scannet_eval + + +def aggregate_predictions(masks, labels, scores, valid_class_ids): + """Maps predictions to ScanNet evaluator format. + + Args: + masks (list[torch.Tensor]): Per scene predicted instance masks. + labels (list[torch.Tensor]): Per scene predicted instance labels. + scores (list[torch.Tensor]): Per scene predicted instance scores. + valid_class_ids (tuple[int]): Ids of valid categories. + + Returns: + list[dict]: Per scene aggregated predictions. + """ + infos = [] + for id, (mask, label, score) in enumerate(zip(masks, labels, scores)): + mask = mask.clone().numpy() + label = label.clone().numpy() + score = score.clone().numpy() + info = dict() + n_instances = mask.max() + 1 + for i in range(n_instances): + # match pred_instance['filename'] from assign_instances_for_scan + file_name = f'{id}_{i}' + info[file_name] = dict() + info[file_name]['mask'] = (mask == i).astype(np.int) + info[file_name]['label_id'] = valid_class_ids[label[i]] + info[file_name]['conf'] = score[i] + infos.append(info) + return infos + + +def rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids): + """Maps gt instance and semantic masks to instance masks for ScanNet + evaluator. + + Args: + gt_semantic_masks (list[torch.Tensor]): Per scene gt semantic masks. + gt_instance_masks (list[torch.Tensor]): Per scene gt instance masks. + valid_class_ids (tuple[int]): Ids of valid categories. + + Returns: + list[np.array]: Per scene instance masks. 
+ """ + renamed_instance_masks = [] + for semantic_mask, instance_mask in zip(gt_semantic_masks, + gt_instance_masks): + semantic_mask = semantic_mask.clone().numpy() + instance_mask = instance_mask.clone().numpy() + unique = np.unique(instance_mask) + assert len(unique) < 1000 + for i in unique: + semantic_instance = semantic_mask[instance_mask == i] + semantic_unique = np.unique(semantic_instance) + assert len(semantic_unique) == 1 + if semantic_unique[0] < len(valid_class_ids): + instance_mask[ + instance_mask == + i] = 1000 * valid_class_ids[semantic_unique[0]] + i + renamed_instance_masks.append(instance_mask) + return renamed_instance_masks + + +def instance_seg_eval(gt_semantic_masks, + gt_instance_masks, + pred_instance_masks, + pred_instance_labels, + pred_instance_scores, + valid_class_ids, + class_labels, + options=None, + logger=None): + """Instance Segmentation Evaluation. + + Evaluate the result of the instance segmentation. + + Args: + gt_semantic_masks (list[torch.Tensor]): Ground truth semantic masks. + gt_instance_masks (list[torch.Tensor]): Ground truth instance masks. + pred_instance_masks (list[torch.Tensor]): Predicted instance masks. + pred_instance_labels (list[torch.Tensor]): Predicted instance labels. + pred_instance_scores (list[torch.Tensor]): Predicted instance labels. + valid_class_ids (tuple[int]): Ids of valid categories. + class_labels (tuple[str]): Names of valid categories. + options (dict, optional): Additional options. Keys may contain: + `overlaps`, `min_region_sizes`, `distance_threshes`, + `distance_confs`. Default: None. + logger (logging.Logger | str, optional): The way to print the mAP + summary. See `mmdet.utils.print_log()` for details. Default: None. + + Returns: + dict[str, float]: Dict of results. + """ + assert len(valid_class_ids) == len(class_labels) + id_to_label = { + valid_class_ids[i]: class_labels[i] + for i in range(len(valid_class_ids)) + } + preds = aggregate_predictions( + masks=pred_instance_masks, + labels=pred_instance_labels, + scores=pred_instance_scores, + valid_class_ids=valid_class_ids) + gts = rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids) + metrics = scannet_eval( + preds=preds, + gts=gts, + options=options, + valid_class_ids=valid_class_ids, + class_labels=class_labels, + id_to_label=id_to_label) + header = ['classes', 'AP_0.25', 'AP_0.50', 'AP'] + rows = [] + for label, data in metrics['classes'].items(): + aps = [data['ap25%'], data['ap50%'], data['ap']] + rows.append([label] + [f'{ap:.4f}' for ap in aps]) + aps = metrics['all_ap_25%'], metrics['all_ap_50%'], metrics['all_ap'] + footer = ['Overall'] + [f'{ap:.4f}' for ap in aps] + table = AsciiTable([header] + rows + [footer]) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + return metrics diff --git a/mmdet3d/core/evaluation/kitti_utils/__init__.py b/mmdet3d/core/evaluation/kitti_utils/__init__.py index 23c1cdf..3d91126 100644 --- a/mmdet3d/core/evaluation/kitti_utils/__init__.py +++ b/mmdet3d/core/evaluation/kitti_utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .eval import kitti_eval, kitti_eval_coco_style - -__all__ = ['kitti_eval', 'kitti_eval_coco_style'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from .eval import kitti_eval, kitti_eval_coco_style + +__all__ = ['kitti_eval', 'kitti_eval_coco_style'] diff --git a/mmdet3d/core/evaluation/kitti_utils/eval.py b/mmdet3d/core/evaluation/kitti_utils/eval.py index f8408df..b42d92b 100644 --- a/mmdet3d/core/evaluation/kitti_utils/eval.py +++ b/mmdet3d/core/evaluation/kitti_utils/eval.py @@ -1,950 +1,950 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import gc -import io as sysio - -import numba -import numpy as np - - -@numba.jit -def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41): - scores.sort() - scores = scores[::-1] - current_recall = 0 - thresholds = [] - for i, score in enumerate(scores): - l_recall = (i + 1) / num_gt - if i < (len(scores) - 1): - r_recall = (i + 2) / num_gt - else: - r_recall = l_recall - if (((r_recall - current_recall) < (current_recall - l_recall)) - and (i < (len(scores) - 1))): - continue - # recall = l_recall - thresholds.append(score) - current_recall += 1 / (num_sample_pts - 1.0) - return thresholds - - -def clean_data(gt_anno, dt_anno, current_class, difficulty): - CLASS_NAMES = ['car', 'pedestrian', 'cyclist'] - MIN_HEIGHT = [40, 25, 25] - MAX_OCCLUSION = [0, 1, 2] - MAX_TRUNCATION = [0.15, 0.3, 0.5] - dc_bboxes, ignored_gt, ignored_dt = [], [], [] - current_cls_name = CLASS_NAMES[current_class].lower() - num_gt = len(gt_anno['name']) - num_dt = len(dt_anno['name']) - num_valid_gt = 0 - for i in range(num_gt): - bbox = gt_anno['bbox'][i] - gt_name = gt_anno['name'][i].lower() - height = bbox[3] - bbox[1] - valid_class = -1 - if (gt_name == current_cls_name): - valid_class = 1 - elif (current_cls_name == 'Pedestrian'.lower() - and 'Person_sitting'.lower() == gt_name): - valid_class = 0 - elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name): - valid_class = 0 - else: - valid_class = -1 - ignore = False - if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty]) - or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty]) - or (height <= MIN_HEIGHT[difficulty])): - ignore = True - if valid_class == 1 and not ignore: - ignored_gt.append(0) - num_valid_gt += 1 - elif (valid_class == 0 or (ignore and (valid_class == 1))): - ignored_gt.append(1) - else: - ignored_gt.append(-1) - # for i in range(num_gt): - if gt_anno['name'][i] == 'DontCare': - dc_bboxes.append(gt_anno['bbox'][i]) - for i in range(num_dt): - if (dt_anno['name'][i].lower() == current_cls_name): - valid_class = 1 - else: - valid_class = -1 - height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1]) - if height < MIN_HEIGHT[difficulty]: - ignored_dt.append(1) - elif valid_class == 1: - ignored_dt.append(0) - else: - ignored_dt.append(-1) - - return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes - - -@numba.jit(nopython=True) -def image_box_overlap(boxes, query_boxes, criterion=-1): - N = boxes.shape[0] - K = query_boxes.shape[0] - overlaps = np.zeros((N, K), dtype=boxes.dtype) - for k in range(K): - qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) * - (query_boxes[k, 3] - query_boxes[k, 1])) - for n in range(N): - iw = ( - min(boxes[n, 2], query_boxes[k, 2]) - - max(boxes[n, 0], query_boxes[k, 0])) - if iw > 0: - ih = ( - min(boxes[n, 3], query_boxes[k, 3]) - - max(boxes[n, 1], query_boxes[k, 1])) - if ih > 0: - if criterion == -1: - ua = ((boxes[n, 2] - boxes[n, 0]) * - (boxes[n, 3] - boxes[n, 1]) + qbox_area - - iw * ih) - elif criterion == 0: - ua = ((boxes[n, 2] - boxes[n, 0]) * - (boxes[n, 3] - boxes[n, 1])) - elif criterion == 1: - ua = qbox_area - else: - ua = 1.0 - overlaps[n, k] = iw * ih / ua - 
return overlaps - - -def bev_box_overlap(boxes, qboxes, criterion=-1): - from .rotate_iou import rotate_iou_gpu_eval - riou = rotate_iou_gpu_eval(boxes, qboxes, criterion) - return riou - - -@numba.jit(nopython=True, parallel=True) -def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1): - # ONLY support overlap in CAMERA, not lidar. - # TODO: change to use prange for parallel mode, should check the difference - N, K = boxes.shape[0], qboxes.shape[0] - for i in numba.prange(N): - for j in numba.prange(K): - if rinc[i, j] > 0: - # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + - # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) - iw = ( - min(boxes[i, 1], qboxes[j, 1]) - - max(boxes[i, 1] - boxes[i, 4], - qboxes[j, 1] - qboxes[j, 4])) - - if iw > 0: - area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] - area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] - inc = iw * rinc[i, j] - if criterion == -1: - ua = (area1 + area2 - inc) - elif criterion == 0: - ua = area1 - elif criterion == 1: - ua = area2 - else: - ua = inc - rinc[i, j] = inc / ua - else: - rinc[i, j] = 0.0 - - -def d3_box_overlap(boxes, qboxes, criterion=-1): - from .rotate_iou import rotate_iou_gpu_eval - rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]], - qboxes[:, [0, 2, 3, 5, 6]], 2) - d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) - return rinc - - -@numba.jit(nopython=True) -def compute_statistics_jit(overlaps, - gt_datas, - dt_datas, - ignored_gt, - ignored_det, - dc_bboxes, - metric, - min_overlap, - thresh=0, - compute_fp=False, - compute_aos=False): - - det_size = dt_datas.shape[0] - gt_size = gt_datas.shape[0] - dt_scores = dt_datas[:, -1] - dt_alphas = dt_datas[:, 4] - gt_alphas = gt_datas[:, 4] - dt_bboxes = dt_datas[:, :4] - # gt_bboxes = gt_datas[:, :4] - - assigned_detection = [False] * det_size - ignored_threshold = [False] * det_size - if compute_fp: - for i in range(det_size): - if (dt_scores[i] < thresh): - ignored_threshold[i] = True - NO_DETECTION = -10000000 - tp, fp, fn, similarity = 0, 0, 0, 0 - # thresholds = [0.0] - # delta = [0.0] - thresholds = np.zeros((gt_size, )) - thresh_idx = 0 - delta = np.zeros((gt_size, )) - delta_idx = 0 - for i in range(gt_size): - if ignored_gt[i] == -1: - continue - det_idx = -1 - valid_detection = NO_DETECTION - max_overlap = 0 - assigned_ignored_det = False - - for j in range(det_size): - if (ignored_det[j] == -1): - continue - if (assigned_detection[j]): - continue - if (ignored_threshold[j]): - continue - overlap = overlaps[j, i] - dt_score = dt_scores[j] - if (not compute_fp and (overlap > min_overlap) - and dt_score > valid_detection): - det_idx = j - valid_detection = dt_score - elif (compute_fp and (overlap > min_overlap) - and (overlap > max_overlap or assigned_ignored_det) - and ignored_det[j] == 0): - max_overlap = overlap - det_idx = j - valid_detection = 1 - assigned_ignored_det = False - elif (compute_fp and (overlap > min_overlap) - and (valid_detection == NO_DETECTION) - and ignored_det[j] == 1): - det_idx = j - valid_detection = 1 - assigned_ignored_det = True - - if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: - fn += 1 - elif ((valid_detection != NO_DETECTION) - and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)): - assigned_detection[det_idx] = True - elif valid_detection != NO_DETECTION: - tp += 1 - # thresholds.append(dt_scores[det_idx]) - thresholds[thresh_idx] = dt_scores[det_idx] - thresh_idx += 1 - if compute_aos: - # delta.append(gt_alphas[i] - dt_alphas[det_idx]) - delta[delta_idx] = gt_alphas[i] - 
dt_alphas[det_idx] - delta_idx += 1 - - assigned_detection[det_idx] = True - if compute_fp: - for i in range(det_size): - if (not (assigned_detection[i] or ignored_det[i] == -1 - or ignored_det[i] == 1 or ignored_threshold[i])): - fp += 1 - nstuff = 0 - if metric == 0: - overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) - for i in range(dc_bboxes.shape[0]): - for j in range(det_size): - if (assigned_detection[j]): - continue - if (ignored_det[j] == -1 or ignored_det[j] == 1): - continue - if (ignored_threshold[j]): - continue - if overlaps_dt_dc[j, i] > min_overlap: - assigned_detection[j] = True - nstuff += 1 - fp -= nstuff - if compute_aos: - tmp = np.zeros((fp + delta_idx, )) - # tmp = [0] * fp - for i in range(delta_idx): - tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 - # tmp.append((1.0 + np.cos(delta[i])) / 2.0) - # assert len(tmp) == fp + tp - # assert len(delta) == tp - if tp > 0 or fp > 0: - similarity = np.sum(tmp) - else: - similarity = -1 - return tp, fp, fn, similarity, thresholds[:thresh_idx] - - -def get_split_parts(num, num_part): - same_part = num // num_part - remain_num = num % num_part - if remain_num == 0: - return [same_part] * num_part - else: - return [same_part] * num_part + [remain_num] - - -@numba.jit(nopython=True) -def fused_compute_statistics(overlaps, - pr, - gt_nums, - dt_nums, - dc_nums, - gt_datas, - dt_datas, - dontcares, - ignored_gts, - ignored_dets, - metric, - min_overlap, - thresholds, - compute_aos=False): - gt_num = 0 - dt_num = 0 - dc_num = 0 - for i in range(gt_nums.shape[0]): - for t, thresh in enumerate(thresholds): - overlap = overlaps[dt_num:dt_num + dt_nums[i], - gt_num:gt_num + gt_nums[i]] - - gt_data = gt_datas[gt_num:gt_num + gt_nums[i]] - dt_data = dt_datas[dt_num:dt_num + dt_nums[i]] - ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]] - ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]] - dontcare = dontcares[dc_num:dc_num + dc_nums[i]] - tp, fp, fn, similarity, _ = compute_statistics_jit( - overlap, - gt_data, - dt_data, - ignored_gt, - ignored_det, - dontcare, - metric, - min_overlap=min_overlap, - thresh=thresh, - compute_fp=True, - compute_aos=compute_aos) - pr[t, 0] += tp - pr[t, 1] += fp - pr[t, 2] += fn - if similarity != -1: - pr[t, 3] += similarity - gt_num += gt_nums[i] - dt_num += dt_nums[i] - dc_num += dc_nums[i] - - -def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50): - """Fast iou algorithm. this function can be used independently to do result - analysis. Must be used in CAMERA coordinate system. - - Args: - gt_annos (dict): Must from get_label_annos() in kitti_common.py. - dt_annos (dict): Must from get_label_annos() in kitti_common.py. - metric (int): Eval type. 0: bbox, 1: bev, 2: 3d. - num_parts (int): A parameter for fast calculate algorithm. 
- """ - assert len(gt_annos) == len(dt_annos) - total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0) - total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0) - num_examples = len(gt_annos) - split_parts = get_split_parts(num_examples, num_parts) - parted_overlaps = [] - example_idx = 0 - - for num_part in split_parts: - gt_annos_part = gt_annos[example_idx:example_idx + num_part] - dt_annos_part = dt_annos[example_idx:example_idx + num_part] - if metric == 0: - gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0) - dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0) - overlap_part = image_box_overlap(gt_boxes, dt_boxes) - elif metric == 1: - loc = np.concatenate( - [a['location'][:, [0, 2]] for a in gt_annos_part], 0) - dims = np.concatenate( - [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0) - rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) - gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], - axis=1) - loc = np.concatenate( - [a['location'][:, [0, 2]] for a in dt_annos_part], 0) - dims = np.concatenate( - [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0) - rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) - dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], - axis=1) - overlap_part = bev_box_overlap(gt_boxes, - dt_boxes).astype(np.float64) - elif metric == 2: - loc = np.concatenate([a['location'] for a in gt_annos_part], 0) - dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0) - rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) - gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], - axis=1) - loc = np.concatenate([a['location'] for a in dt_annos_part], 0) - dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0) - rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) - dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], - axis=1) - overlap_part = d3_box_overlap(gt_boxes, - dt_boxes).astype(np.float64) - else: - raise ValueError('unknown metric') - parted_overlaps.append(overlap_part) - example_idx += num_part - overlaps = [] - example_idx = 0 - for j, num_part in enumerate(split_parts): - gt_annos_part = gt_annos[example_idx:example_idx + num_part] - dt_annos_part = dt_annos[example_idx:example_idx + num_part] - gt_num_idx, dt_num_idx = 0, 0 - for i in range(num_part): - gt_box_num = total_gt_num[example_idx + i] - dt_box_num = total_dt_num[example_idx + i] - overlaps.append( - parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num, - dt_num_idx:dt_num_idx + dt_box_num]) - gt_num_idx += gt_box_num - dt_num_idx += dt_box_num - example_idx += num_part - - return overlaps, parted_overlaps, total_gt_num, total_dt_num - - -def _prepare_data(gt_annos, dt_annos, current_class, difficulty): - gt_datas_list = [] - dt_datas_list = [] - total_dc_num = [] - ignored_gts, ignored_dets, dontcares = [], [], [] - total_num_valid_gt = 0 - for i in range(len(gt_annos)): - rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) - num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets - ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) - ignored_dets.append(np.array(ignored_det, dtype=np.int64)) - if len(dc_bboxes) == 0: - dc_bboxes = np.zeros((0, 4)).astype(np.float64) - else: - dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) - total_dc_num.append(dc_bboxes.shape[0]) - dontcares.append(dc_bboxes) - total_num_valid_gt += num_valid_gt - gt_datas = np.concatenate( - [gt_annos[i]['bbox'], 
gt_annos[i]['alpha'][..., np.newaxis]], 1) - dt_datas = np.concatenate([ - dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis], - dt_annos[i]['score'][..., np.newaxis] - ], 1) - gt_datas_list.append(gt_datas) - dt_datas_list.append(dt_datas) - total_dc_num = np.stack(total_dc_num, axis=0) - return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, - total_dc_num, total_num_valid_gt) - - -def eval_class(gt_annos, - dt_annos, - current_classes, - difficultys, - metric, - min_overlaps, - compute_aos=False, - num_parts=200): - """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. - - Args: - gt_annos (dict): Must from get_label_annos() in kitti_common.py. - dt_annos (dict): Must from get_label_annos() in kitti_common.py. - current_classes (list[int]): 0: car, 1: pedestrian, 2: cyclist. - difficultys (list[int]): Eval difficulty, 0: easy, 1: normal, 2: hard - metric (int): Eval type. 0: bbox, 1: bev, 2: 3d - min_overlaps (float): Min overlap. format: - [num_overlap, metric, class]. - num_parts (int): A parameter for fast calculate algorithm - - Returns: - dict[str, np.ndarray]: recall, precision and aos - """ - assert len(gt_annos) == len(dt_annos) - num_examples = len(gt_annos) - if num_examples < num_parts: - num_parts = num_examples - split_parts = get_split_parts(num_examples, num_parts) - - rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) - overlaps, parted_overlaps, total_dt_num, total_gt_num = rets - N_SAMPLE_PTS = 41 - num_minoverlap = len(min_overlaps) - num_class = len(current_classes) - num_difficulty = len(difficultys) - precision = np.zeros( - [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) - recall = np.zeros( - [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) - aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) - for m, current_class in enumerate(current_classes): - for idx_l, difficulty in enumerate(difficultys): - rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) - (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, - dontcares, total_dc_num, total_num_valid_gt) = rets - for k, min_overlap in enumerate(min_overlaps[:, metric, m]): - thresholdss = [] - for i in range(len(gt_annos)): - rets = compute_statistics_jit( - overlaps[i], - gt_datas_list[i], - dt_datas_list[i], - ignored_gts[i], - ignored_dets[i], - dontcares[i], - metric, - min_overlap=min_overlap, - thresh=0.0, - compute_fp=False) - tp, fp, fn, similarity, thresholds = rets - thresholdss += thresholds.tolist() - thresholdss = np.array(thresholdss) - thresholds = get_thresholds(thresholdss, total_num_valid_gt) - thresholds = np.array(thresholds) - pr = np.zeros([len(thresholds), 4]) - idx = 0 - for j, num_part in enumerate(split_parts): - gt_datas_part = np.concatenate( - gt_datas_list[idx:idx + num_part], 0) - dt_datas_part = np.concatenate( - dt_datas_list[idx:idx + num_part], 0) - dc_datas_part = np.concatenate( - dontcares[idx:idx + num_part], 0) - ignored_dets_part = np.concatenate( - ignored_dets[idx:idx + num_part], 0) - ignored_gts_part = np.concatenate( - ignored_gts[idx:idx + num_part], 0) - fused_compute_statistics( - parted_overlaps[j], - pr, - total_gt_num[idx:idx + num_part], - total_dt_num[idx:idx + num_part], - total_dc_num[idx:idx + num_part], - gt_datas_part, - dt_datas_part, - dc_datas_part, - ignored_gts_part, - ignored_dets_part, - metric, - min_overlap=min_overlap, - thresholds=thresholds, - compute_aos=compute_aos) - idx += num_part - for i in range(len(thresholds)): - 
recall[m, idx_l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) - precision[m, idx_l, k, i] = pr[i, 0] / ( - pr[i, 0] + pr[i, 1]) - if compute_aos: - aos[m, idx_l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) - for i in range(len(thresholds)): - precision[m, idx_l, k, i] = np.max( - precision[m, idx_l, k, i:], axis=-1) - recall[m, idx_l, k, i] = np.max( - recall[m, idx_l, k, i:], axis=-1) - if compute_aos: - aos[m, idx_l, k, i] = np.max( - aos[m, idx_l, k, i:], axis=-1) - ret_dict = { - 'recall': recall, - 'precision': precision, - 'orientation': aos, - } - - # clean temp variables - del overlaps - del parted_overlaps - - gc.collect() - return ret_dict - - -def get_mAP11(prec): - sums = 0 - for i in range(0, prec.shape[-1], 4): - sums = sums + prec[..., i] - return sums / 11 * 100 - - -def get_mAP40(prec): - sums = 0 - for i in range(1, prec.shape[-1]): - sums = sums + prec[..., i] - return sums / 40 * 100 - - -def print_str(value, *arg, sstream=None): - if sstream is None: - sstream = sysio.StringIO() - sstream.truncate(0) - sstream.seek(0) - print(value, *arg, file=sstream) - return sstream.getvalue() - - -def do_eval(gt_annos, - dt_annos, - current_classes, - min_overlaps, - eval_types=['bbox', 'bev', '3d']): - # min_overlaps: [num_minoverlap, metric, num_class] - difficultys = [0, 1, 2] - mAP11_bbox = None - mAP11_aos = None - mAP40_bbox = None - mAP40_aos = None - if 'bbox' in eval_types: - ret = eval_class( - gt_annos, - dt_annos, - current_classes, - difficultys, - 0, - min_overlaps, - compute_aos=('aos' in eval_types)) - # ret: [num_class, num_diff, num_minoverlap, num_sample_points] - mAP11_bbox = get_mAP11(ret['precision']) - mAP40_bbox = get_mAP40(ret['precision']) - if 'aos' in eval_types: - mAP11_aos = get_mAP11(ret['orientation']) - mAP40_aos = get_mAP40(ret['orientation']) - - mAP11_bev = None - mAP40_bev = None - if 'bev' in eval_types: - ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1, - min_overlaps) - mAP11_bev = get_mAP11(ret['precision']) - mAP40_bev = get_mAP40(ret['precision']) - - mAP11_3d = None - mAP40_3d = None - if '3d' in eval_types: - ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, - min_overlaps) - mAP11_3d = get_mAP11(ret['precision']) - mAP40_3d = get_mAP40(ret['precision']) - return (mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, - mAP40_3d, mAP40_aos) - - -def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, - compute_aos): - # overlap_ranges: [range, metric, num_class] - min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) - for i in range(overlap_ranges.shape[1]): - for j in range(overlap_ranges.shape[2]): - min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j]) - mAP_bbox, mAP_bev, mAP_3d, mAP_aos, _, _, \ - _, _ = do_eval(gt_annos, dt_annos, - current_classes, min_overlaps, - compute_aos) - # ret: [num_class, num_diff, num_minoverlap] - mAP_bbox = mAP_bbox.mean(-1) - mAP_bev = mAP_bev.mean(-1) - mAP_3d = mAP_3d.mean(-1) - if mAP_aos is not None: - mAP_aos = mAP_aos.mean(-1) - return mAP_bbox, mAP_bev, mAP_3d, mAP_aos - - -def kitti_eval(gt_annos, - dt_annos, - current_classes, - eval_types=['bbox', 'bev', '3d']): - """KITTI evaluation. - - Args: - gt_annos (list[dict]): Contain gt information of each sample. - dt_annos (list[dict]): Contain detected information of each sample. - current_classes (list[str]): Classes to evaluation. - eval_types (list[str], optional): Types to eval. - Defaults to ['bbox', 'bev', '3d']. 
- - Returns: - tuple: String and dict of evaluation results. - """ - assert len(eval_types) > 0, 'must contain at least one evaluation type' - if 'aos' in eval_types: - assert 'bbox' in eval_types, 'must evaluate bbox when evaluating aos' - overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7, - 0.5], [0.7, 0.5, 0.5, 0.7, 0.5], - [0.7, 0.5, 0.5, 0.7, 0.5]]) - overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5], - [0.5, 0.25, 0.25, 0.5, 0.25], - [0.5, 0.25, 0.25, 0.5, 0.25]]) - min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5] - class_to_name = { - 0: 'Car', - 1: 'Pedestrian', - 2: 'Cyclist', - 3: 'Van', - 4: 'Person_sitting', - } - name_to_class = {v: n for n, v in class_to_name.items()} - if not isinstance(current_classes, (list, tuple)): - current_classes = [current_classes] - current_classes_int = [] - for curcls in current_classes: - if isinstance(curcls, str): - current_classes_int.append(name_to_class[curcls]) - else: - current_classes_int.append(curcls) - current_classes = current_classes_int - min_overlaps = min_overlaps[:, :, current_classes] - result = '' - # check whether alpha is valid - compute_aos = False - pred_alpha = False - valid_alpha_gt = False - for anno in dt_annos: - mask = (anno['alpha'] != -10) - if anno['alpha'][mask].shape[0] != 0: - pred_alpha = True - break - for anno in gt_annos: - if anno['alpha'][0] != -10: - valid_alpha_gt = True - break - compute_aos = (pred_alpha and valid_alpha_gt) - if compute_aos: - eval_types.append('aos') - - mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, \ - mAP40_3d, mAP40_aos = do_eval(gt_annos, dt_annos, - current_classes, min_overlaps, - eval_types) - - ret_dict = {} - difficulty = ['easy', 'moderate', 'hard'] - - # calculate AP11 - result += '\n----------- AP11 Results ------------\n\n' - for j, curcls in enumerate(current_classes): - # mAP threshold array: [num_minoverlap, metric, class] - # mAP result: [num_class, num_diff, num_minoverlap] - curcls_name = class_to_name[curcls] - for i in range(min_overlaps.shape[0]): - # prepare results for print - result += ('{} AP11@{:.2f}, {:.2f}, {:.2f}:\n'.format( - curcls_name, *min_overlaps[i, :, j])) - if mAP11_bbox is not None: - result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP11_bbox[j, :, i]) - if mAP11_bev is not None: - result += 'bev AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP11_bev[j, :, i]) - if mAP11_3d is not None: - result += '3d AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP11_3d[j, :, i]) - if compute_aos: - result += 'aos AP11:{:.2f}, {:.2f}, {:.2f}\n'.format( - *mAP11_aos[j, :, i]) - - # prepare results for logger - for idx in range(3): - if i == 0: - postfix = f'{difficulty[idx]}_strict' - else: - postfix = f'{difficulty[idx]}_loose' - prefix = f'KITTI/{curcls_name}' - if mAP11_3d is not None: - ret_dict[f'{prefix}_3D_AP11_{postfix}'] =\ - mAP11_3d[j, idx, i] - if mAP11_bev is not None: - ret_dict[f'{prefix}_BEV_AP11_{postfix}'] =\ - mAP11_bev[j, idx, i] - if mAP11_bbox is not None: - ret_dict[f'{prefix}_2D_AP11_{postfix}'] =\ - mAP11_bbox[j, idx, i] - - # calculate mAP11 over all classes if there are multiple classes - if len(current_classes) > 1: - # prepare results for print - result += ('\nOverall AP11@{}, {}, {}:\n'.format(*difficulty)) - if mAP11_bbox is not None: - mAP11_bbox = mAP11_bbox.mean(axis=0) - result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP11_bbox[:, 0]) - if mAP11_bev is not None: - mAP11_bev = mAP11_bev.mean(axis=0) - result += 'bev AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP11_bev[:, 0]) - 
if mAP11_3d is not None: - mAP11_3d = mAP11_3d.mean(axis=0) - result += '3d AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP11_3d[:, - 0]) - if compute_aos: - mAP11_aos = mAP11_aos.mean(axis=0) - result += 'aos AP11:{:.2f}, {:.2f}, {:.2f}\n'.format( - *mAP11_aos[:, 0]) - - # prepare results for logger - for idx in range(3): - postfix = f'{difficulty[idx]}' - if mAP11_3d is not None: - ret_dict[f'KITTI/Overall_3D_AP11_{postfix}'] = mAP11_3d[idx, 0] - if mAP11_bev is not None: - ret_dict[f'KITTI/Overall_BEV_AP11_{postfix}'] =\ - mAP11_bev[idx, 0] - if mAP11_bbox is not None: - ret_dict[f'KITTI/Overall_2D_AP11_{postfix}'] =\ - mAP11_bbox[idx, 0] - - # Calculate AP40 - result += '\n----------- AP40 Results ------------\n\n' - for j, curcls in enumerate(current_classes): - # mAP threshold array: [num_minoverlap, metric, class] - # mAP result: [num_class, num_diff, num_minoverlap] - curcls_name = class_to_name[curcls] - for i in range(min_overlaps.shape[0]): - # prepare results for print - result += ('{} AP40@{:.2f}, {:.2f}, {:.2f}:\n'.format( - curcls_name, *min_overlaps[i, :, j])) - if mAP40_bbox is not None: - result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP40_bbox[j, :, i]) - if mAP40_bev is not None: - result += 'bev AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP40_bev[j, :, i]) - if mAP40_3d is not None: - result += '3d AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP40_3d[j, :, i]) - if compute_aos: - result += 'aos AP40:{:.2f}, {:.2f}, {:.2f}\n'.format( - *mAP40_aos[j, :, i]) - - # prepare results for logger - for idx in range(3): - if i == 0: - postfix = f'{difficulty[idx]}_strict' - else: - postfix = f'{difficulty[idx]}_loose' - prefix = f'KITTI/{curcls_name}' - if mAP40_3d is not None: - ret_dict[f'{prefix}_3D_AP40_{postfix}'] =\ - mAP40_3d[j, idx, i] - if mAP40_bev is not None: - ret_dict[f'{prefix}_BEV_AP40_{postfix}'] =\ - mAP40_bev[j, idx, i] - if mAP40_bbox is not None: - ret_dict[f'{prefix}_2D_AP40_{postfix}'] =\ - mAP40_bbox[j, idx, i] - - # calculate mAP40 over all classes if there are multiple classes - if len(current_classes) > 1: - # prepare results for print - result += ('\nOverall AP40@{}, {}, {}:\n'.format(*difficulty)) - if mAP40_bbox is not None: - mAP40_bbox = mAP40_bbox.mean(axis=0) - result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP40_bbox[:, 0]) - if mAP40_bev is not None: - mAP40_bev = mAP40_bev.mean(axis=0) - result += 'bev AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( - *mAP40_bev[:, 0]) - if mAP40_3d is not None: - mAP40_3d = mAP40_3d.mean(axis=0) - result += '3d AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP40_3d[:, - 0]) - if compute_aos: - mAP40_aos = mAP40_aos.mean(axis=0) - result += 'aos AP40:{:.2f}, {:.2f}, {:.2f}\n'.format( - *mAP40_aos[:, 0]) - - # prepare results for logger - for idx in range(3): - postfix = f'{difficulty[idx]}' - if mAP40_3d is not None: - ret_dict[f'KITTI/Overall_3D_AP40_{postfix}'] = mAP40_3d[idx, 0] - if mAP40_bev is not None: - ret_dict[f'KITTI/Overall_BEV_AP40_{postfix}'] =\ - mAP40_bev[idx, 0] - if mAP40_bbox is not None: - ret_dict[f'KITTI/Overall_2D_AP40_{postfix}'] =\ - mAP40_bbox[idx, 0] - - return result, ret_dict - - -def kitti_eval_coco_style(gt_annos, dt_annos, current_classes): - """coco style evaluation of kitti. - - Args: - gt_annos (list[dict]): Contain gt information of each sample. - dt_annos (list[dict]): Contain detected information of each sample. - current_classes (list[str]): Classes to evaluation. - - Returns: - string: Evaluation results. 
- """ - class_to_name = { - 0: 'Car', - 1: 'Pedestrian', - 2: 'Cyclist', - 3: 'Van', - 4: 'Person_sitting', - } - class_to_range = { - 0: [0.5, 0.95, 10], - 1: [0.25, 0.7, 10], - 2: [0.25, 0.7, 10], - 3: [0.5, 0.95, 10], - 4: [0.25, 0.7, 10], - } - name_to_class = {v: n for n, v in class_to_name.items()} - if not isinstance(current_classes, (list, tuple)): - current_classes = [current_classes] - current_classes_int = [] - for curcls in current_classes: - if isinstance(curcls, str): - current_classes_int.append(name_to_class[curcls]) - else: - current_classes_int.append(curcls) - current_classes = current_classes_int - overlap_ranges = np.zeros([3, 3, len(current_classes)]) - for i, curcls in enumerate(current_classes): - overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:, - np.newaxis] - result = '' - # check whether alpha is valid - compute_aos = False - for anno in dt_annos: - if anno['alpha'].shape[0] != 0: - if anno['alpha'][0] != -10: - compute_aos = True - break - mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval( - gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) - for j, curcls in enumerate(current_classes): - # mAP threshold array: [num_minoverlap, metric, class] - # mAP result: [num_class, num_diff, num_minoverlap] - o_range = np.array(class_to_range[curcls])[[0, 2, 1]] - o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) - result += print_str((f'{class_to_name[curcls]} ' - 'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range))) - result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, ' - f'{mAPbbox[j, 1]:.2f}, ' - f'{mAPbbox[j, 2]:.2f}')) - result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, ' - f'{mAPbev[j, 1]:.2f}, ' - f'{mAPbev[j, 2]:.2f}')) - result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, ' - f'{mAP3d[j, 1]:.2f}, ' - f'{mAP3d[j, 2]:.2f}')) - if compute_aos: - result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, ' - f'{mAPaos[j, 1]:.2f}, ' - f'{mAPaos[j, 2]:.2f}')) - return result +# Copyright (c) OpenMMLab. All rights reserved. 
+import gc +import io as sysio + +import numba +import numpy as np + + +@numba.jit +def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41): + scores.sort() + scores = scores[::-1] + current_recall = 0 + thresholds = [] + for i, score in enumerate(scores): + l_recall = (i + 1) / num_gt + if i < (len(scores) - 1): + r_recall = (i + 2) / num_gt + else: + r_recall = l_recall + if (((r_recall - current_recall) < (current_recall - l_recall)) + and (i < (len(scores) - 1))): + continue + # recall = l_recall + thresholds.append(score) + current_recall += 1 / (num_sample_pts - 1.0) + return thresholds + + +def clean_data(gt_anno, dt_anno, current_class, difficulty): + CLASS_NAMES = ['car', 'pedestrian', 'cyclist'] + MIN_HEIGHT = [40, 25, 25] + MAX_OCCLUSION = [0, 1, 2] + MAX_TRUNCATION = [0.15, 0.3, 0.5] + dc_bboxes, ignored_gt, ignored_dt = [], [], [] + current_cls_name = CLASS_NAMES[current_class].lower() + num_gt = len(gt_anno['name']) + num_dt = len(dt_anno['name']) + num_valid_gt = 0 + for i in range(num_gt): + bbox = gt_anno['bbox'][i] + gt_name = gt_anno['name'][i].lower() + height = bbox[3] - bbox[1] + valid_class = -1 + if (gt_name == current_cls_name): + valid_class = 1 + elif (current_cls_name == 'Pedestrian'.lower() + and 'Person_sitting'.lower() == gt_name): + valid_class = 0 + elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name): + valid_class = 0 + else: + valid_class = -1 + ignore = False + if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty]) + or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty]) + or (height <= MIN_HEIGHT[difficulty])): + ignore = True + if valid_class == 1 and not ignore: + ignored_gt.append(0) + num_valid_gt += 1 + elif (valid_class == 0 or (ignore and (valid_class == 1))): + ignored_gt.append(1) + else: + ignored_gt.append(-1) + # for i in range(num_gt): + if gt_anno['name'][i] == 'DontCare': + dc_bboxes.append(gt_anno['bbox'][i]) + for i in range(num_dt): + if (dt_anno['name'][i].lower() == current_cls_name): + valid_class = 1 + else: + valid_class = -1 + height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1]) + if height < MIN_HEIGHT[difficulty]: + ignored_dt.append(1) + elif valid_class == 1: + ignored_dt.append(0) + else: + ignored_dt.append(-1) + + return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes + + +@numba.jit(nopython=True) +def image_box_overlap(boxes, query_boxes, criterion=-1): + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) * + (query_boxes[k, 3] - query_boxes[k, 1])) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0])) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1])) + if ih > 0: + if criterion == -1: + ua = ((boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1]) + qbox_area - + iw * ih) + elif criterion == 0: + ua = ((boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1])) + elif criterion == 1: + ua = qbox_area + else: + ua = 1.0 + overlaps[n, k] = iw * ih / ua + return overlaps + + +def bev_box_overlap(boxes, qboxes, criterion=-1): + from .rotate_iou import rotate_iou_gpu_eval + riou = rotate_iou_gpu_eval(boxes, qboxes, criterion) + return riou + + +@numba.jit(nopython=True, parallel=True) +def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1): + # ONLY support overlap in CAMERA, not lidar. 
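For orientation, here is a small, hypothetical check of the 2D overlap helper above (assuming this patched module is importable as mmdet3d.core.evaluation.kitti_utils.eval): criterion=-1 returns ordinary IoU, while criterion=0 normalises by the area of the first argument's box, which is how compute_statistics_jit later matches detections against DontCare regions.

import numpy as np
from mmdet3d.core.evaluation.kitti_utils.eval import image_box_overlap

boxes = np.array([[0.0, 0.0, 10.0, 10.0]])    # one 10 x 10 box, (x1, y1, x2, y2)
queries = np.array([[5.0, 0.0, 15.0, 10.0]])  # same size, shifted by half a width
print(image_box_overlap(boxes, queries))      # IoU: 50 / (100 + 100 - 50) = 0.333...
print(image_box_overlap(boxes, queries, 0))   # intersection / area(boxes): 50 / 100 = 0.5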
+ # TODO: change to use prange for parallel mode, should check the difference + N, K = boxes.shape[0], qboxes.shape[0] + for i in numba.prange(N): + for j in numba.prange(K): + if rinc[i, j] > 0: + # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + + # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) + iw = ( + min(boxes[i, 1], qboxes[j, 1]) - + max(boxes[i, 1] - boxes[i, 4], + qboxes[j, 1] - qboxes[j, 4])) + + if iw > 0: + area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] + area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] + inc = iw * rinc[i, j] + if criterion == -1: + ua = (area1 + area2 - inc) + elif criterion == 0: + ua = area1 + elif criterion == 1: + ua = area2 + else: + ua = inc + rinc[i, j] = inc / ua + else: + rinc[i, j] = 0.0 + + +def d3_box_overlap(boxes, qboxes, criterion=-1): + from .rotate_iou import rotate_iou_gpu_eval + rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]], + qboxes[:, [0, 2, 3, 5, 6]], 2) + d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) + return rinc + + +@numba.jit(nopython=True) +def compute_statistics_jit(overlaps, + gt_datas, + dt_datas, + ignored_gt, + ignored_det, + dc_bboxes, + metric, + min_overlap, + thresh=0, + compute_fp=False, + compute_aos=False): + + det_size = dt_datas.shape[0] + gt_size = gt_datas.shape[0] + dt_scores = dt_datas[:, -1] + dt_alphas = dt_datas[:, 4] + gt_alphas = gt_datas[:, 4] + dt_bboxes = dt_datas[:, :4] + # gt_bboxes = gt_datas[:, :4] + + assigned_detection = [False] * det_size + ignored_threshold = [False] * det_size + if compute_fp: + for i in range(det_size): + if (dt_scores[i] < thresh): + ignored_threshold[i] = True + NO_DETECTION = -10000000 + tp, fp, fn, similarity = 0, 0, 0, 0 + # thresholds = [0.0] + # delta = [0.0] + thresholds = np.zeros((gt_size, )) + thresh_idx = 0 + delta = np.zeros((gt_size, )) + delta_idx = 0 + for i in range(gt_size): + if ignored_gt[i] == -1: + continue + det_idx = -1 + valid_detection = NO_DETECTION + max_overlap = 0 + assigned_ignored_det = False + + for j in range(det_size): + if (ignored_det[j] == -1): + continue + if (assigned_detection[j]): + continue + if (ignored_threshold[j]): + continue + overlap = overlaps[j, i] + dt_score = dt_scores[j] + if (not compute_fp and (overlap > min_overlap) + and dt_score > valid_detection): + det_idx = j + valid_detection = dt_score + elif (compute_fp and (overlap > min_overlap) + and (overlap > max_overlap or assigned_ignored_det) + and ignored_det[j] == 0): + max_overlap = overlap + det_idx = j + valid_detection = 1 + assigned_ignored_det = False + elif (compute_fp and (overlap > min_overlap) + and (valid_detection == NO_DETECTION) + and ignored_det[j] == 1): + det_idx = j + valid_detection = 1 + assigned_ignored_det = True + + if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: + fn += 1 + elif ((valid_detection != NO_DETECTION) + and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)): + assigned_detection[det_idx] = True + elif valid_detection != NO_DETECTION: + tp += 1 + # thresholds.append(dt_scores[det_idx]) + thresholds[thresh_idx] = dt_scores[det_idx] + thresh_idx += 1 + if compute_aos: + # delta.append(gt_alphas[i] - dt_alphas[det_idx]) + delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] + delta_idx += 1 + + assigned_detection[det_idx] = True + if compute_fp: + for i in range(det_size): + if (not (assigned_detection[i] or ignored_det[i] == -1 + or ignored_det[i] == 1 or ignored_threshold[i])): + fp += 1 + nstuff = 0 + if metric == 0: + overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) + for i in 
range(dc_bboxes.shape[0]): + for j in range(det_size): + if (assigned_detection[j]): + continue + if (ignored_det[j] == -1 or ignored_det[j] == 1): + continue + if (ignored_threshold[j]): + continue + if overlaps_dt_dc[j, i] > min_overlap: + assigned_detection[j] = True + nstuff += 1 + fp -= nstuff + if compute_aos: + tmp = np.zeros((fp + delta_idx, )) + # tmp = [0] * fp + for i in range(delta_idx): + tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 + # tmp.append((1.0 + np.cos(delta[i])) / 2.0) + # assert len(tmp) == fp + tp + # assert len(delta) == tp + if tp > 0 or fp > 0: + similarity = np.sum(tmp) + else: + similarity = -1 + return tp, fp, fn, similarity, thresholds[:thresh_idx] + + +def get_split_parts(num, num_part): + same_part = num // num_part + remain_num = num % num_part + if remain_num == 0: + return [same_part] * num_part + else: + return [same_part] * num_part + [remain_num] + + +@numba.jit(nopython=True) +def fused_compute_statistics(overlaps, + pr, + gt_nums, + dt_nums, + dc_nums, + gt_datas, + dt_datas, + dontcares, + ignored_gts, + ignored_dets, + metric, + min_overlap, + thresholds, + compute_aos=False): + gt_num = 0 + dt_num = 0 + dc_num = 0 + for i in range(gt_nums.shape[0]): + for t, thresh in enumerate(thresholds): + overlap = overlaps[dt_num:dt_num + dt_nums[i], + gt_num:gt_num + gt_nums[i]] + + gt_data = gt_datas[gt_num:gt_num + gt_nums[i]] + dt_data = dt_datas[dt_num:dt_num + dt_nums[i]] + ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]] + ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]] + dontcare = dontcares[dc_num:dc_num + dc_nums[i]] + tp, fp, fn, similarity, _ = compute_statistics_jit( + overlap, + gt_data, + dt_data, + ignored_gt, + ignored_det, + dontcare, + metric, + min_overlap=min_overlap, + thresh=thresh, + compute_fp=True, + compute_aos=compute_aos) + pr[t, 0] += tp + pr[t, 1] += fp + pr[t, 2] += fn + if similarity != -1: + pr[t, 3] += similarity + gt_num += gt_nums[i] + dt_num += dt_nums[i] + dc_num += dc_nums[i] + + +def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50): + """Fast iou algorithm. this function can be used independently to do result + analysis. Must be used in CAMERA coordinate system. + + Args: + gt_annos (dict): Must from get_label_annos() in kitti_common.py. + dt_annos (dict): Must from get_label_annos() in kitti_common.py. + metric (int): Eval type. 0: bbox, 1: bev, 2: 3d. + num_parts (int): A parameter for fast calculate algorithm. 
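As a quick illustration of the chunking helper defined above (a plain-Python sketch; get_split_parts is assumed to be in scope), the returned part sizes always sum to num, with any remainder appended as one extra trailing part. calculate_iou_partly and eval_class use these part sizes to bound how many frames are concatenated into a single overlap matrix at a time.

print(get_split_parts(100, 4))             # [25, 25, 25, 25]
print(get_split_parts(100, 3))             # [33, 33, 33, 1]
assert sum(get_split_parts(100, 3)) == 100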
+ """ + assert len(gt_annos) == len(dt_annos) + total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0) + total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + parted_overlaps = [] + example_idx = 0 + + for num_part in split_parts: + gt_annos_part = gt_annos[example_idx:example_idx + num_part] + dt_annos_part = dt_annos[example_idx:example_idx + num_part] + if metric == 0: + gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0) + dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0) + overlap_part = image_box_overlap(gt_boxes, dt_boxes) + elif metric == 1: + loc = np.concatenate( + [a['location'][:, [0, 2]] for a in gt_annos_part], 0) + dims = np.concatenate( + [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + loc = np.concatenate( + [a['location'][:, [0, 2]] for a in dt_annos_part], 0) + dims = np.concatenate( + [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + overlap_part = bev_box_overlap(gt_boxes, + dt_boxes).astype(np.float64) + elif metric == 2: + loc = np.concatenate([a['location'] for a in gt_annos_part], 0) + dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + loc = np.concatenate([a['location'] for a in dt_annos_part], 0) + dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + overlap_part = d3_box_overlap(gt_boxes, + dt_boxes).astype(np.float64) + else: + raise ValueError('unknown metric') + parted_overlaps.append(overlap_part) + example_idx += num_part + overlaps = [] + example_idx = 0 + for j, num_part in enumerate(split_parts): + gt_annos_part = gt_annos[example_idx:example_idx + num_part] + dt_annos_part = dt_annos[example_idx:example_idx + num_part] + gt_num_idx, dt_num_idx = 0, 0 + for i in range(num_part): + gt_box_num = total_gt_num[example_idx + i] + dt_box_num = total_dt_num[example_idx + i] + overlaps.append( + parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num, + dt_num_idx:dt_num_idx + dt_box_num]) + gt_num_idx += gt_box_num + dt_num_idx += dt_box_num + example_idx += num_part + + return overlaps, parted_overlaps, total_gt_num, total_dt_num + + +def _prepare_data(gt_annos, dt_annos, current_class, difficulty): + gt_datas_list = [] + dt_datas_list = [] + total_dc_num = [] + ignored_gts, ignored_dets, dontcares = [], [], [] + total_num_valid_gt = 0 + for i in range(len(gt_annos)): + rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) + num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) + ignored_dets.append(np.array(ignored_det, dtype=np.int64)) + if len(dc_bboxes) == 0: + dc_bboxes = np.zeros((0, 4)).astype(np.float64) + else: + dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) + total_dc_num.append(dc_bboxes.shape[0]) + dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt + gt_datas = np.concatenate( + [gt_annos[i]['bbox'], 
gt_annos[i]['alpha'][..., np.newaxis]], 1) + dt_datas = np.concatenate([ + dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis], + dt_annos[i]['score'][..., np.newaxis] + ], 1) + gt_datas_list.append(gt_datas) + dt_datas_list.append(dt_datas) + total_dc_num = np.stack(total_dc_num, axis=0) + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, + total_dc_num, total_num_valid_gt) + + +def eval_class(gt_annos, + dt_annos, + current_classes, + difficultys, + metric, + min_overlaps, + compute_aos=False, + num_parts=200): + """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. + + Args: + gt_annos (dict): Must from get_label_annos() in kitti_common.py. + dt_annos (dict): Must from get_label_annos() in kitti_common.py. + current_classes (list[int]): 0: car, 1: pedestrian, 2: cyclist. + difficultys (list[int]): Eval difficulty, 0: easy, 1: normal, 2: hard + metric (int): Eval type. 0: bbox, 1: bev, 2: 3d + min_overlaps (float): Min overlap. format: + [num_overlap, metric, class]. + num_parts (int): A parameter for fast calculate algorithm + + Returns: + dict[str, np.ndarray]: recall, precision and aos + """ + assert len(gt_annos) == len(dt_annos) + num_examples = len(gt_annos) + if num_examples < num_parts: + num_parts = num_examples + split_parts = get_split_parts(num_examples, num_parts) + + rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) + overlaps, parted_overlaps, total_dt_num, total_gt_num = rets + N_SAMPLE_PTS = 41 + num_minoverlap = len(min_overlaps) + num_class = len(current_classes) + num_difficulty = len(difficultys) + precision = np.zeros( + [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + recall = np.zeros( + [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + for m, current_class in enumerate(current_classes): + for idx_l, difficulty in enumerate(difficultys): + rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) + (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, + dontcares, total_dc_num, total_num_valid_gt) = rets + for k, min_overlap in enumerate(min_overlaps[:, metric, m]): + thresholdss = [] + for i in range(len(gt_annos)): + rets = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], + ignored_dets[i], + dontcares[i], + metric, + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False) + tp, fp, fn, similarity, thresholds = rets + thresholdss += thresholds.tolist() + thresholdss = np.array(thresholdss) + thresholds = get_thresholds(thresholdss, total_num_valid_gt) + thresholds = np.array(thresholds) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate( + gt_datas_list[idx:idx + num_part], 0) + dt_datas_part = np.concatenate( + dt_datas_list[idx:idx + num_part], 0) + dc_datas_part = np.concatenate( + dontcares[idx:idx + num_part], 0) + ignored_dets_part = np.concatenate( + ignored_dets[idx:idx + num_part], 0) + ignored_gts_part = np.concatenate( + ignored_gts[idx:idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx:idx + num_part], + total_dt_num[idx:idx + num_part], + total_dc_num[idx:idx + num_part], + gt_datas_part, + dt_datas_part, + dc_datas_part, + ignored_gts_part, + ignored_dets_part, + metric, + min_overlap=min_overlap, + thresholds=thresholds, + compute_aos=compute_aos) + idx += num_part + for i in range(len(thresholds)): + 
recall[m, idx_l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, idx_l, k, i] = pr[i, 0] / ( + pr[i, 0] + pr[i, 1]) + if compute_aos: + aos[m, idx_l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) + for i in range(len(thresholds)): + precision[m, idx_l, k, i] = np.max( + precision[m, idx_l, k, i:], axis=-1) + recall[m, idx_l, k, i] = np.max( + recall[m, idx_l, k, i:], axis=-1) + if compute_aos: + aos[m, idx_l, k, i] = np.max( + aos[m, idx_l, k, i:], axis=-1) + ret_dict = { + 'recall': recall, + 'precision': precision, + 'orientation': aos, + } + + # clean temp variables + del overlaps + del parted_overlaps + + gc.collect() + return ret_dict + + +def get_mAP11(prec): + sums = 0 + for i in range(0, prec.shape[-1], 4): + sums = sums + prec[..., i] + return sums / 11 * 100 + + +def get_mAP40(prec): + sums = 0 + for i in range(1, prec.shape[-1]): + sums = sums + prec[..., i] + return sums / 40 * 100 + + +def print_str(value, *arg, sstream=None): + if sstream is None: + sstream = sysio.StringIO() + sstream.truncate(0) + sstream.seek(0) + print(value, *arg, file=sstream) + return sstream.getvalue() + + +def do_eval(gt_annos, + dt_annos, + current_classes, + min_overlaps, + eval_types=['bbox', 'bev', '3d']): + # min_overlaps: [num_minoverlap, metric, num_class] + difficultys = [0, 1, 2] + mAP11_bbox = None + mAP11_aos = None + mAP40_bbox = None + mAP40_aos = None + if 'bbox' in eval_types: + ret = eval_class( + gt_annos, + dt_annos, + current_classes, + difficultys, + 0, + min_overlaps, + compute_aos=('aos' in eval_types)) + # ret: [num_class, num_diff, num_minoverlap, num_sample_points] + mAP11_bbox = get_mAP11(ret['precision']) + mAP40_bbox = get_mAP40(ret['precision']) + if 'aos' in eval_types: + mAP11_aos = get_mAP11(ret['orientation']) + mAP40_aos = get_mAP40(ret['orientation']) + + mAP11_bev = None + mAP40_bev = None + if 'bev' in eval_types: + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1, + min_overlaps) + mAP11_bev = get_mAP11(ret['precision']) + mAP40_bev = get_mAP40(ret['precision']) + + mAP11_3d = None + mAP40_3d = None + if '3d' in eval_types: + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, + min_overlaps) + mAP11_3d = get_mAP11(ret['precision']) + mAP40_3d = get_mAP40(ret['precision']) + return (mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, + mAP40_3d, mAP40_aos) + + +def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, + compute_aos): + # overlap_ranges: [range, metric, num_class] + min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) + for i in range(overlap_ranges.shape[1]): + for j in range(overlap_ranges.shape[2]): + min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j]) + mAP_bbox, mAP_bev, mAP_3d, mAP_aos, _, _, \ + _, _ = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + compute_aos) + # ret: [num_class, num_diff, num_minoverlap] + mAP_bbox = mAP_bbox.mean(-1) + mAP_bev = mAP_bev.mean(-1) + mAP_3d = mAP_3d.mean(-1) + if mAP_aos is not None: + mAP_aos = mAP_aos.mean(-1) + return mAP_bbox, mAP_bev, mAP_3d, mAP_aos + + +def kitti_eval(gt_annos, + dt_annos, + current_classes, + eval_types=['bbox', 'bev', '3d']): + """KITTI evaluation. + + Args: + gt_annos (list[dict]): Contain gt information of each sample. + dt_annos (list[dict]): Contain detected information of each sample. + current_classes (list[str]): Classes to evaluation. + eval_types (list[str], optional): Types to eval. + Defaults to ['bbox', 'bev', '3d']. 
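To make the two AP conventions concrete, here is a tiny self-contained check of get_mAP11 and get_mAP40 on a synthetic precision array (both helpers are defined above; only numpy is assumed). The 11-point metric samples every fourth of the 41 recall points (recall 0.0, 0.1, ..., 1.0), while the 40-point metric skips recall 0 and averages the remaining 40 points, so a constant precision of 0.5 yields 50.0 under both.

import numpy as np

prec = np.full((1, 3, 2, 41), 0.5)  # [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]
print(get_mAP11(prec)[0, 0, 0])     # 0.5 * 11 / 11 * 100 = 50.0
print(get_mAP40(prec)[0, 0, 0])     # 0.5 * 40 / 40 * 100 = 50.0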
+ + Returns: + tuple: String and dict of evaluation results. + """ + assert len(eval_types) > 0, 'must contain at least one evaluation type' + if 'aos' in eval_types: + assert 'bbox' in eval_types, 'must evaluate bbox when evaluating aos' + overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7, + 0.5], [0.7, 0.5, 0.5, 0.7, 0.5], + [0.7, 0.5, 0.5, 0.7, 0.5]]) + overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5], + [0.5, 0.25, 0.25, 0.5, 0.25], + [0.5, 0.25, 0.25, 0.5, 0.25]]) + min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5] + class_to_name = { + 0: 'Car', + 1: 'Pedestrian', + 2: 'Cyclist', + 3: 'Van', + 4: 'Person_sitting', + } + name_to_class = {v: n for n, v in class_to_name.items()} + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + current_classes_int = [] + for curcls in current_classes: + if isinstance(curcls, str): + current_classes_int.append(name_to_class[curcls]) + else: + current_classes_int.append(curcls) + current_classes = current_classes_int + min_overlaps = min_overlaps[:, :, current_classes] + result = '' + # check whether alpha is valid + compute_aos = False + pred_alpha = False + valid_alpha_gt = False + for anno in dt_annos: + mask = (anno['alpha'] != -10) + if anno['alpha'][mask].shape[0] != 0: + pred_alpha = True + break + for anno in gt_annos: + if anno['alpha'][0] != -10: + valid_alpha_gt = True + break + compute_aos = (pred_alpha and valid_alpha_gt) + if compute_aos: + eval_types.append('aos') + + mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, \ + mAP40_3d, mAP40_aos = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + eval_types) + + ret_dict = {} + difficulty = ['easy', 'moderate', 'hard'] + + # calculate AP11 + result += '\n----------- AP11 Results ------------\n\n' + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + curcls_name = class_to_name[curcls] + for i in range(min_overlaps.shape[0]): + # prepare results for print + result += ('{} AP11@{:.2f}, {:.2f}, {:.2f}:\n'.format( + curcls_name, *min_overlaps[i, :, j])) + if mAP11_bbox is not None: + result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_bbox[j, :, i]) + if mAP11_bev is not None: + result += 'bev AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_bev[j, :, i]) + if mAP11_3d is not None: + result += '3d AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_3d[j, :, i]) + if compute_aos: + result += 'aos AP11:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAP11_aos[j, :, i]) + + # prepare results for logger + for idx in range(3): + if i == 0: + postfix = f'{difficulty[idx]}_strict' + else: + postfix = f'{difficulty[idx]}_loose' + prefix = f'KITTI/{curcls_name}' + if mAP11_3d is not None: + ret_dict[f'{prefix}_3D_AP11_{postfix}'] =\ + mAP11_3d[j, idx, i] + if mAP11_bev is not None: + ret_dict[f'{prefix}_BEV_AP11_{postfix}'] =\ + mAP11_bev[j, idx, i] + if mAP11_bbox is not None: + ret_dict[f'{prefix}_2D_AP11_{postfix}'] =\ + mAP11_bbox[j, idx, i] + + # calculate mAP11 over all classes if there are multiple classes + if len(current_classes) > 1: + # prepare results for print + result += ('\nOverall AP11@{}, {}, {}:\n'.format(*difficulty)) + if mAP11_bbox is not None: + mAP11_bbox = mAP11_bbox.mean(axis=0) + result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_bbox[:, 0]) + if mAP11_bev is not None: + mAP11_bev = mAP11_bev.mean(axis=0) + result += 'bev AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP11_bev[:, 0]) + 
if mAP11_3d is not None: + mAP11_3d = mAP11_3d.mean(axis=0) + result += '3d AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP11_3d[:, + 0]) + if compute_aos: + mAP11_aos = mAP11_aos.mean(axis=0) + result += 'aos AP11:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAP11_aos[:, 0]) + + # prepare results for logger + for idx in range(3): + postfix = f'{difficulty[idx]}' + if mAP11_3d is not None: + ret_dict[f'KITTI/Overall_3D_AP11_{postfix}'] = mAP11_3d[idx, 0] + if mAP11_bev is not None: + ret_dict[f'KITTI/Overall_BEV_AP11_{postfix}'] =\ + mAP11_bev[idx, 0] + if mAP11_bbox is not None: + ret_dict[f'KITTI/Overall_2D_AP11_{postfix}'] =\ + mAP11_bbox[idx, 0] + + # Calculate AP40 + result += '\n----------- AP40 Results ------------\n\n' + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + curcls_name = class_to_name[curcls] + for i in range(min_overlaps.shape[0]): + # prepare results for print + result += ('{} AP40@{:.2f}, {:.2f}, {:.2f}:\n'.format( + curcls_name, *min_overlaps[i, :, j])) + if mAP40_bbox is not None: + result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_bbox[j, :, i]) + if mAP40_bev is not None: + result += 'bev AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_bev[j, :, i]) + if mAP40_3d is not None: + result += '3d AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_3d[j, :, i]) + if compute_aos: + result += 'aos AP40:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAP40_aos[j, :, i]) + + # prepare results for logger + for idx in range(3): + if i == 0: + postfix = f'{difficulty[idx]}_strict' + else: + postfix = f'{difficulty[idx]}_loose' + prefix = f'KITTI/{curcls_name}' + if mAP40_3d is not None: + ret_dict[f'{prefix}_3D_AP40_{postfix}'] =\ + mAP40_3d[j, idx, i] + if mAP40_bev is not None: + ret_dict[f'{prefix}_BEV_AP40_{postfix}'] =\ + mAP40_bev[j, idx, i] + if mAP40_bbox is not None: + ret_dict[f'{prefix}_2D_AP40_{postfix}'] =\ + mAP40_bbox[j, idx, i] + + # calculate mAP40 over all classes if there are multiple classes + if len(current_classes) > 1: + # prepare results for print + result += ('\nOverall AP40@{}, {}, {}:\n'.format(*difficulty)) + if mAP40_bbox is not None: + mAP40_bbox = mAP40_bbox.mean(axis=0) + result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_bbox[:, 0]) + if mAP40_bev is not None: + mAP40_bev = mAP40_bev.mean(axis=0) + result += 'bev AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP40_bev[:, 0]) + if mAP40_3d is not None: + mAP40_3d = mAP40_3d.mean(axis=0) + result += '3d AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP40_3d[:, + 0]) + if compute_aos: + mAP40_aos = mAP40_aos.mean(axis=0) + result += 'aos AP40:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAP40_aos[:, 0]) + + # prepare results for logger + for idx in range(3): + postfix = f'{difficulty[idx]}' + if mAP40_3d is not None: + ret_dict[f'KITTI/Overall_3D_AP40_{postfix}'] = mAP40_3d[idx, 0] + if mAP40_bev is not None: + ret_dict[f'KITTI/Overall_BEV_AP40_{postfix}'] =\ + mAP40_bev[idx, 0] + if mAP40_bbox is not None: + ret_dict[f'KITTI/Overall_2D_AP40_{postfix}'] =\ + mAP40_bbox[idx, 0] + + return result, ret_dict + + +def kitti_eval_coco_style(gt_annos, dt_annos, current_classes): + """coco style evaluation of kitti. + + Args: + gt_annos (list[dict]): Contain gt information of each sample. + dt_annos (list[dict]): Contain detected information of each sample. + current_classes (list[str]): Classes to evaluation. + + Returns: + string: Evaluation results. 
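Below is a minimal, hypothetical end-to-end call of kitti_eval (assuming this file is importable as mmdet3d.core.evaluation.kitti_utils.eval; the toy annotations are purely illustrative). Restricting eval_types to ['bbox'] keeps the example on the numba CPU path, since the bev/3d metrics route their overlaps through the CUDA rotate-IoU kernel.

import numpy as np
from mmdet3d.core.evaluation.kitti_utils.eval import kitti_eval

# One frame with one ground-truth Car and one matching detection (80 px tall -> 'easy').
gt_annos = [dict(
    name=np.array(['Car']),
    truncated=np.array([0.0]),
    occluded=np.array([0]),
    alpha=np.array([0.0]),
    bbox=np.array([[100.0, 100.0, 250.0, 180.0]]))]
dt_annos = [dict(
    name=np.array(['Car']),
    alpha=np.array([0.0]),
    bbox=np.array([[100.0, 100.0, 250.0, 180.0]]),
    score=np.array([0.9]))]

result_str, result_dict = kitti_eval(gt_annos, dt_annos, ['Car'], eval_types=['bbox'])
print(result_str)
print(result_dict['KITTI/Car_2D_AP11_easy_strict'])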
+ """ + class_to_name = { + 0: 'Car', + 1: 'Pedestrian', + 2: 'Cyclist', + 3: 'Van', + 4: 'Person_sitting', + } + class_to_range = { + 0: [0.5, 0.95, 10], + 1: [0.25, 0.7, 10], + 2: [0.25, 0.7, 10], + 3: [0.5, 0.95, 10], + 4: [0.25, 0.7, 10], + } + name_to_class = {v: n for n, v in class_to_name.items()} + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + current_classes_int = [] + for curcls in current_classes: + if isinstance(curcls, str): + current_classes_int.append(name_to_class[curcls]) + else: + current_classes_int.append(curcls) + current_classes = current_classes_int + overlap_ranges = np.zeros([3, 3, len(current_classes)]) + for i, curcls in enumerate(current_classes): + overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:, + np.newaxis] + result = '' + # check whether alpha is valid + compute_aos = False + for anno in dt_annos: + if anno['alpha'].shape[0] != 0: + if anno['alpha'][0] != -10: + compute_aos = True + break + mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval( + gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + o_range = np.array(class_to_range[curcls])[[0, 2, 1]] + o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) + result += print_str((f'{class_to_name[curcls]} ' + 'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range))) + result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, ' + f'{mAPbbox[j, 1]:.2f}, ' + f'{mAPbbox[j, 2]:.2f}')) + result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, ' + f'{mAPbev[j, 1]:.2f}, ' + f'{mAPbev[j, 2]:.2f}')) + result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, ' + f'{mAP3d[j, 1]:.2f}, ' + f'{mAP3d[j, 2]:.2f}')) + if compute_aos: + result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, ' + f'{mAPaos[j, 1]:.2f}, ' + f'{mAPaos[j, 2]:.2f}')) + return result diff --git a/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py b/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py index 9ed75bf..b4d39da 100644 --- a/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py +++ b/mmdet3d/core/evaluation/kitti_utils/rotate_iou.py @@ -1,379 +1,379 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-##################### -# Based on https://github.com/hongzhenwang/RRPN-revise -# Licensed under The MIT License -# Author: yanyan, scrin@foxmail.com -##################### -import math - -import numba -import numpy as np -from numba import cuda - - -@numba.jit(nopython=True) -def div_up(m, n): - return m // n + (m % n > 0) - - -@cuda.jit(device=True, inline=True) -def trangle_area(a, b, c): - return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * - (b[0] - c[0])) / 2.0 - - -@cuda.jit(device=True, inline=True) -def area(int_pts, num_of_inter): - area_val = 0.0 - for i in range(num_of_inter - 2): - area_val += abs( - trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4], - int_pts[2 * i + 4:2 * i + 6])) - return area_val - - -@cuda.jit(device=True, inline=True) -def sort_vertex_in_convex_polygon(int_pts, num_of_inter): - if num_of_inter > 0: - center = cuda.local.array((2, ), dtype=numba.float32) - center[:] = 0.0 - for i in range(num_of_inter): - center[0] += int_pts[2 * i] - center[1] += int_pts[2 * i + 1] - center[0] /= num_of_inter - center[1] /= num_of_inter - v = cuda.local.array((2, ), dtype=numba.float32) - vs = cuda.local.array((16, ), dtype=numba.float32) - for i in range(num_of_inter): - v[0] = int_pts[2 * i] - center[0] - v[1] = int_pts[2 * i + 1] - center[1] - d = math.sqrt(v[0] * v[0] + v[1] * v[1]) - v[0] = v[0] / d - v[1] = v[1] / d - if v[1] < 0: - v[0] = -2 - v[0] - vs[i] = v[0] - j = 0 - temp = 0 - for i in range(1, num_of_inter): - if vs[i - 1] > vs[i]: - temp = vs[i] - tx = int_pts[2 * i] - ty = int_pts[2 * i + 1] - j = i - while j > 0 and vs[j - 1] > temp: - vs[j] = vs[j - 1] - int_pts[j * 2] = int_pts[j * 2 - 2] - int_pts[j * 2 + 1] = int_pts[j * 2 - 1] - j -= 1 - - vs[j] = temp - int_pts[j * 2] = tx - int_pts[j * 2 + 1] = ty - - -@cuda.jit(device=True, inline=True) -def line_segment_intersection(pts1, pts2, i, j, temp_pts): - A = cuda.local.array((2, ), dtype=numba.float32) - B = cuda.local.array((2, ), dtype=numba.float32) - C = cuda.local.array((2, ), dtype=numba.float32) - D = cuda.local.array((2, ), dtype=numba.float32) - - A[0] = pts1[2 * i] - A[1] = pts1[2 * i + 1] - - B[0] = pts1[2 * ((i + 1) % 4)] - B[1] = pts1[2 * ((i + 1) % 4) + 1] - - C[0] = pts2[2 * j] - C[1] = pts2[2 * j + 1] - - D[0] = pts2[2 * ((j + 1) % 4)] - D[1] = pts2[2 * ((j + 1) % 4) + 1] - BA0 = B[0] - A[0] - BA1 = B[1] - A[1] - DA0 = D[0] - A[0] - CA0 = C[0] - A[0] - DA1 = D[1] - A[1] - CA1 = C[1] - A[1] - acd = DA1 * CA0 > CA1 * DA0 - bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) - if acd != bcd: - abc = CA1 * BA0 > BA1 * CA0 - abd = DA1 * BA0 > BA1 * DA0 - if abc != abd: - DC0 = D[0] - C[0] - DC1 = D[1] - C[1] - ABBA = A[0] * B[1] - B[0] * A[1] - CDDC = C[0] * D[1] - D[0] * C[1] - DH = BA1 * DC0 - BA0 * DC1 - Dx = ABBA * DC0 - BA0 * CDDC - Dy = ABBA * DC1 - BA1 * CDDC - temp_pts[0] = Dx / DH - temp_pts[1] = Dy / DH - return True - return False - - -@cuda.jit(device=True, inline=True) -def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts): - a = cuda.local.array((2, ), dtype=numba.float32) - b = cuda.local.array((2, ), dtype=numba.float32) - c = cuda.local.array((2, ), dtype=numba.float32) - d = cuda.local.array((2, ), dtype=numba.float32) - - a[0] = pts1[2 * i] - a[1] = pts1[2 * i + 1] - - b[0] = pts1[2 * ((i + 1) % 4)] - b[1] = pts1[2 * ((i + 1) % 4) + 1] - - c[0] = pts2[2 * j] - c[1] = pts2[2 * j + 1] - - d[0] = pts2[2 * ((j + 1) % 4)] - d[1] = pts2[2 * ((j + 1) % 4) + 1] - - area_abc = trangle_area(a, b, c) - area_abd = trangle_area(a, b, d) - - if area_abc * 
area_abd >= 0: - return False - - area_cda = trangle_area(c, d, a) - area_cdb = area_cda + area_abc - area_abd - - if area_cda * area_cdb >= 0: - return False - t = area_cda / (area_abd - area_abc) - - dx = t * (b[0] - a[0]) - dy = t * (b[1] - a[1]) - temp_pts[0] = a[0] + dx - temp_pts[1] = a[1] + dy - return True - - -@cuda.jit(device=True, inline=True) -def point_in_quadrilateral(pt_x, pt_y, corners): - ab0 = corners[2] - corners[0] - ab1 = corners[3] - corners[1] - - ad0 = corners[6] - corners[0] - ad1 = corners[7] - corners[1] - - ap0 = pt_x - corners[0] - ap1 = pt_y - corners[1] - - abab = ab0 * ab0 + ab1 * ab1 - abap = ab0 * ap0 + ab1 * ap1 - adad = ad0 * ad0 + ad1 * ad1 - adap = ad0 * ap0 + ad1 * ap1 - - return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 - - -@cuda.jit(device=True, inline=True) -def quadrilateral_intersection(pts1, pts2, int_pts): - num_of_inter = 0 - for i in range(4): - if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): - int_pts[num_of_inter * 2] = pts1[2 * i] - int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] - num_of_inter += 1 - if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): - int_pts[num_of_inter * 2] = pts2[2 * i] - int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] - num_of_inter += 1 - temp_pts = cuda.local.array((2, ), dtype=numba.float32) - for i in range(4): - for j in range(4): - has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) - if has_pts: - int_pts[num_of_inter * 2] = temp_pts[0] - int_pts[num_of_inter * 2 + 1] = temp_pts[1] - num_of_inter += 1 - - return num_of_inter - - -@cuda.jit(device=True, inline=True) -def rbbox_to_corners(corners, rbbox): - # generate clockwise corners and rotate it clockwise - angle = rbbox[4] - a_cos = math.cos(angle) - a_sin = math.sin(angle) - center_x = rbbox[0] - center_y = rbbox[1] - x_d = rbbox[2] - y_d = rbbox[3] - corners_x = cuda.local.array((4, ), dtype=numba.float32) - corners_y = cuda.local.array((4, ), dtype=numba.float32) - corners_x[0] = -x_d / 2 - corners_x[1] = -x_d / 2 - corners_x[2] = x_d / 2 - corners_x[3] = x_d / 2 - corners_y[0] = -y_d / 2 - corners_y[1] = y_d / 2 - corners_y[2] = y_d / 2 - corners_y[3] = -y_d / 2 - for i in range(4): - corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x - corners[2 * i + - 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y - - -@cuda.jit(device=True, inline=True) -def inter(rbbox1, rbbox2): - """Compute intersection of two rotated boxes. - - Args: - rbox1 (np.ndarray, shape=[5]): Rotated 2d box. - rbox2 (np.ndarray, shape=[5]): Rotated 2d box. - - Returns: - float: Intersection of two rotated boxes. - """ - corners1 = cuda.local.array((8, ), dtype=numba.float32) - corners2 = cuda.local.array((8, ), dtype=numba.float32) - intersection_corners = cuda.local.array((16, ), dtype=numba.float32) - - rbbox_to_corners(corners1, rbbox1) - rbbox_to_corners(corners2, rbbox2) - - num_intersection = quadrilateral_intersection(corners1, corners2, - intersection_corners) - sort_vertex_in_convex_polygon(intersection_corners, num_intersection) - # print(intersection_corners.reshape([-1, 2])[:num_intersection]) - - return area(intersection_corners, num_intersection) - - -@cuda.jit(device=True, inline=True) -def devRotateIoUEval(rbox1, rbox2, criterion=-1): - """Compute rotated iou on device. - - Args: - rbox1 (np.ndarray, shape=[5]): Rotated 2d box. - rbox2 (np.ndarray, shape=[5]): Rotated 2d box. - criterion (int, optional): Indicate different type of iou. 
- -1 indicate `area_inter / (area1 + area2 - area_inter)`, - 0 indicate `area_inter / area1`, - 1 indicate `area_inter / area2`. - - Returns: - float: iou between two input boxes. - """ - area1 = rbox1[2] * rbox1[3] - area2 = rbox2[2] * rbox2[3] - area_inter = inter(rbox1, rbox2) - if criterion == -1: - return area_inter / (area1 + area2 - area_inter) - elif criterion == 0: - return area_inter / area1 - elif criterion == 1: - return area_inter / area2 - else: - return area_inter - - -@cuda.jit( - '(int64, int64, float32[:], float32[:], float32[:], int32)', - fastmath=False) -def rotate_iou_kernel_eval(N, - K, - dev_boxes, - dev_query_boxes, - dev_iou, - criterion=-1): - """Kernel of computing rotated IoU. This function is for bev boxes in - camera coordinate system ONLY (the rotation is clockwise). - - Args: - N (int): The number of boxes. - K (int): The number of query boxes. - dev_boxes (np.ndarray): Boxes on device. - dev_query_boxes (np.ndarray): Query boxes on device. - dev_iou (np.ndarray): Computed iou to return. - criterion (int, optional): Indicate different type of iou. - -1 indicate `area_inter / (area1 + area2 - area_inter)`, - 0 indicate `area_inter / area1`, - 1 indicate `area_inter / area2`. - """ - threadsPerBlock = 8 * 8 - row_start = cuda.blockIdx.x - col_start = cuda.blockIdx.y - tx = cuda.threadIdx.x - row_size = min(N - row_start * threadsPerBlock, threadsPerBlock) - col_size = min(K - col_start * threadsPerBlock, threadsPerBlock) - block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) - block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) - - dev_query_box_idx = threadsPerBlock * col_start + tx - dev_box_idx = threadsPerBlock * row_start + tx - if (tx < col_size): - block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] - block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] - block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] - block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] - block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] - if (tx < row_size): - block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] - block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] - block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] - block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] - block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] - cuda.syncthreads() - if tx < row_size: - for i in range(col_size): - offset = ( - row_start * threadsPerBlock * K + col_start * threadsPerBlock + - tx * K + i) - dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5], - block_boxes[tx * 5:tx * 5 + 5], - criterion) - - -def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0): - """Rotated box iou running in gpu. 500x faster than cpu version (take 5ms - in one example with numba.cuda code). convert from [this project]( - https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation). - - This function is for bev boxes in camera coordinate system ONLY - (the rotation is clockwise). - - Args: - boxes (torch.Tensor): rbboxes. format: centers, dims, - angles(clockwise when positive) with the shape of [N, 5]. - query_boxes (torch.FloatTensor, shape=(K, 5)): - rbboxes to compute iou with boxes. - device_id (int, optional): Defaults to 0. Device to use. - criterion (int, optional): Indicate different type of iou. 
- -1 indicate `area_inter / (area1 + area2 - area_inter)`, - 0 indicate `area_inter / area1`, - 1 indicate `area_inter / area2`. - - Returns: - np.ndarray: IoU results. - """ - boxes = boxes.astype(np.float32) - query_boxes = query_boxes.astype(np.float32) - N = boxes.shape[0] - K = query_boxes.shape[0] - iou = np.zeros((N, K), dtype=np.float32) - if N == 0 or K == 0: - return iou - threadsPerBlock = 8 * 8 - cuda.select_device(device_id) - blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock)) - - stream = cuda.stream() - with stream.auto_synchronize(): - boxes_dev = cuda.to_device(boxes.reshape([-1]), stream) - query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream) - iou_dev = cuda.to_device(iou.reshape([-1]), stream) - rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, - stream](N, K, boxes_dev, query_boxes_dev, - iou_dev, criterion) - iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) - return iou.astype(boxes.dtype) +# Copyright (c) OpenMMLab. All rights reserved. +##################### +# Based on https://github.com/hongzhenwang/RRPN-revise +# Licensed under The MIT License +# Author: yanyan, scrin@foxmail.com +##################### +import math + +import numba +import numpy as np +from numba import cuda + + +@numba.jit(nopython=True) +def div_up(m, n): + return m // n + (m % n > 0) + + +@cuda.jit(device=True, inline=True) +def trangle_area(a, b, c): + return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * + (b[0] - c[0])) / 2.0 + + +@cuda.jit(device=True, inline=True) +def area(int_pts, num_of_inter): + area_val = 0.0 + for i in range(num_of_inter - 2): + area_val += abs( + trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4], + int_pts[2 * i + 4:2 * i + 6])) + return area_val + + +@cuda.jit(device=True, inline=True) +def sort_vertex_in_convex_polygon(int_pts, num_of_inter): + if num_of_inter > 0: + center = cuda.local.array((2, ), dtype=numba.float32) + center[:] = 0.0 + for i in range(num_of_inter): + center[0] += int_pts[2 * i] + center[1] += int_pts[2 * i + 1] + center[0] /= num_of_inter + center[1] /= num_of_inter + v = cuda.local.array((2, ), dtype=numba.float32) + vs = cuda.local.array((16, ), dtype=numba.float32) + for i in range(num_of_inter): + v[0] = int_pts[2 * i] - center[0] + v[1] = int_pts[2 * i + 1] - center[1] + d = math.sqrt(v[0] * v[0] + v[1] * v[1]) + v[0] = v[0] / d + v[1] = v[1] / d + if v[1] < 0: + v[0] = -2 - v[0] + vs[i] = v[0] + j = 0 + temp = 0 + for i in range(1, num_of_inter): + if vs[i - 1] > vs[i]: + temp = vs[i] + tx = int_pts[2 * i] + ty = int_pts[2 * i + 1] + j = i + while j > 0 and vs[j - 1] > temp: + vs[j] = vs[j - 1] + int_pts[j * 2] = int_pts[j * 2 - 2] + int_pts[j * 2 + 1] = int_pts[j * 2 - 1] + j -= 1 + + vs[j] = temp + int_pts[j * 2] = tx + int_pts[j * 2 + 1] = ty + + +@cuda.jit(device=True, inline=True) +def line_segment_intersection(pts1, pts2, i, j, temp_pts): + A = cuda.local.array((2, ), dtype=numba.float32) + B = cuda.local.array((2, ), dtype=numba.float32) + C = cuda.local.array((2, ), dtype=numba.float32) + D = cuda.local.array((2, ), dtype=numba.float32) + + A[0] = pts1[2 * i] + A[1] = pts1[2 * i + 1] + + B[0] = pts1[2 * ((i + 1) % 4)] + B[1] = pts1[2 * ((i + 1) % 4) + 1] + + C[0] = pts2[2 * j] + C[1] = pts2[2 * j + 1] + + D[0] = pts2[2 * ((j + 1) % 4)] + D[1] = pts2[2 * ((j + 1) % 4) + 1] + BA0 = B[0] - A[0] + BA1 = B[1] - A[1] + DA0 = D[0] - A[0] + CA0 = C[0] - A[0] + DA1 = D[1] - A[1] + CA1 = C[1] - A[1] + acd = DA1 * CA0 > CA1 * DA0 + bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * 
(D[0] - B[0]) + if acd != bcd: + abc = CA1 * BA0 > BA1 * CA0 + abd = DA1 * BA0 > BA1 * DA0 + if abc != abd: + DC0 = D[0] - C[0] + DC1 = D[1] - C[1] + ABBA = A[0] * B[1] - B[0] * A[1] + CDDC = C[0] * D[1] - D[0] * C[1] + DH = BA1 * DC0 - BA0 * DC1 + Dx = ABBA * DC0 - BA0 * CDDC + Dy = ABBA * DC1 - BA1 * CDDC + temp_pts[0] = Dx / DH + temp_pts[1] = Dy / DH + return True + return False + + +@cuda.jit(device=True, inline=True) +def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts): + a = cuda.local.array((2, ), dtype=numba.float32) + b = cuda.local.array((2, ), dtype=numba.float32) + c = cuda.local.array((2, ), dtype=numba.float32) + d = cuda.local.array((2, ), dtype=numba.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + area_abc = trangle_area(a, b, c) + area_abd = trangle_area(a, b, d) + + if area_abc * area_abd >= 0: + return False + + area_cda = trangle_area(c, d, a) + area_cdb = area_cda + area_abc - area_abd + + if area_cda * area_cdb >= 0: + return False + t = area_cda / (area_abd - area_abc) + + dx = t * (b[0] - a[0]) + dy = t * (b[1] - a[1]) + temp_pts[0] = a[0] + dx + temp_pts[1] = a[1] + dy + return True + + +@cuda.jit(device=True, inline=True) +def point_in_quadrilateral(pt_x, pt_y, corners): + ab0 = corners[2] - corners[0] + ab1 = corners[3] - corners[1] + + ad0 = corners[6] - corners[0] + ad1 = corners[7] - corners[1] + + ap0 = pt_x - corners[0] + ap1 = pt_y - corners[1] + + abab = ab0 * ab0 + ab1 * ab1 + abap = ab0 * ap0 + ab1 * ap1 + adad = ad0 * ad0 + ad1 * ad1 + adap = ad0 * ap0 + ad1 * ap1 + + return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 + + +@cuda.jit(device=True, inline=True) +def quadrilateral_intersection(pts1, pts2, int_pts): + num_of_inter = 0 + for i in range(4): + if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): + int_pts[num_of_inter * 2] = pts1[2 * i] + int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] + num_of_inter += 1 + if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): + int_pts[num_of_inter * 2] = pts2[2 * i] + int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] + num_of_inter += 1 + temp_pts = cuda.local.array((2, ), dtype=numba.float32) + for i in range(4): + for j in range(4): + has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) + if has_pts: + int_pts[num_of_inter * 2] = temp_pts[0] + int_pts[num_of_inter * 2 + 1] = temp_pts[1] + num_of_inter += 1 + + return num_of_inter + + +@cuda.jit(device=True, inline=True) +def rbbox_to_corners(corners, rbbox): + # generate clockwise corners and rotate it clockwise + angle = rbbox[4] + a_cos = math.cos(angle) + a_sin = math.sin(angle) + center_x = rbbox[0] + center_y = rbbox[1] + x_d = rbbox[2] + y_d = rbbox[3] + corners_x = cuda.local.array((4, ), dtype=numba.float32) + corners_y = cuda.local.array((4, ), dtype=numba.float32) + corners_x[0] = -x_d / 2 + corners_x[1] = -x_d / 2 + corners_x[2] = x_d / 2 + corners_x[3] = x_d / 2 + corners_y[0] = -y_d / 2 + corners_y[1] = y_d / 2 + corners_y[2] = y_d / 2 + corners_y[3] = -y_d / 2 + for i in range(4): + corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x + corners[2 * i + + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y + + +@cuda.jit(device=True, inline=True) +def inter(rbbox1, rbbox2): + """Compute intersection of two rotated boxes. 
+ + Args: + rbox1 (np.ndarray, shape=[5]): Rotated 2d box. + rbox2 (np.ndarray, shape=[5]): Rotated 2d box. + + Returns: + float: Intersection of two rotated boxes. + """ + corners1 = cuda.local.array((8, ), dtype=numba.float32) + corners2 = cuda.local.array((8, ), dtype=numba.float32) + intersection_corners = cuda.local.array((16, ), dtype=numba.float32) + + rbbox_to_corners(corners1, rbbox1) + rbbox_to_corners(corners2, rbbox2) + + num_intersection = quadrilateral_intersection(corners1, corners2, + intersection_corners) + sort_vertex_in_convex_polygon(intersection_corners, num_intersection) + # print(intersection_corners.reshape([-1, 2])[:num_intersection]) + + return area(intersection_corners, num_intersection) + + +@cuda.jit(device=True, inline=True) +def devRotateIoUEval(rbox1, rbox2, criterion=-1): + """Compute rotated iou on device. + + Args: + rbox1 (np.ndarray, shape=[5]): Rotated 2d box. + rbox2 (np.ndarray, shape=[5]): Rotated 2d box. + criterion (int, optional): Indicate different type of iou. + -1 indicate `area_inter / (area1 + area2 - area_inter)`, + 0 indicate `area_inter / area1`, + 1 indicate `area_inter / area2`. + + Returns: + float: iou between two input boxes. + """ + area1 = rbox1[2] * rbox1[3] + area2 = rbox2[2] * rbox2[3] + area_inter = inter(rbox1, rbox2) + if criterion == -1: + return area_inter / (area1 + area2 - area_inter) + elif criterion == 0: + return area_inter / area1 + elif criterion == 1: + return area_inter / area2 + else: + return area_inter + + +@cuda.jit( + '(int64, int64, float32[:], float32[:], float32[:], int32)', + fastmath=False) +def rotate_iou_kernel_eval(N, + K, + dev_boxes, + dev_query_boxes, + dev_iou, + criterion=-1): + """Kernel of computing rotated IoU. This function is for bev boxes in + camera coordinate system ONLY (the rotation is clockwise). + + Args: + N (int): The number of boxes. + K (int): The number of query boxes. + dev_boxes (np.ndarray): Boxes on device. + dev_query_boxes (np.ndarray): Query boxes on device. + dev_iou (np.ndarray): Computed iou to return. + criterion (int, optional): Indicate different type of iou. + -1 indicate `area_inter / (area1 + area2 - area_inter)`, + 0 indicate `area_inter / area1`, + 1 indicate `area_inter / area2`. 
+ """ + threadsPerBlock = 8 * 8 + row_start = cuda.blockIdx.x + col_start = cuda.blockIdx.y + tx = cuda.threadIdx.x + row_size = min(N - row_start * threadsPerBlock, threadsPerBlock) + col_size = min(K - col_start * threadsPerBlock, threadsPerBlock) + block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) + block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) + + dev_query_box_idx = threadsPerBlock * col_start + tx + dev_box_idx = threadsPerBlock * row_start + tx + if (tx < col_size): + block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] + block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] + block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] + block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] + block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] + if (tx < row_size): + block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] + block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] + block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] + block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] + block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] + cuda.syncthreads() + if tx < row_size: + for i in range(col_size): + offset = ( + row_start * threadsPerBlock * K + col_start * threadsPerBlock + + tx * K + i) + dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5], + block_boxes[tx * 5:tx * 5 + 5], + criterion) + + +def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0): + """Rotated box iou running in gpu. 500x faster than cpu version (take 5ms + in one example with numba.cuda code). convert from [this project]( + https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation). + + This function is for bev boxes in camera coordinate system ONLY + (the rotation is clockwise). + + Args: + boxes (torch.Tensor): rbboxes. format: centers, dims, + angles(clockwise when positive) with the shape of [N, 5]. + query_boxes (torch.FloatTensor, shape=(K, 5)): + rbboxes to compute iou with boxes. + device_id (int, optional): Defaults to 0. Device to use. + criterion (int, optional): Indicate different type of iou. + -1 indicate `area_inter / (area1 + area2 - area_inter)`, + 0 indicate `area_inter / area1`, + 1 indicate `area_inter / area2`. + + Returns: + np.ndarray: IoU results. + """ + boxes = boxes.astype(np.float32) + query_boxes = query_boxes.astype(np.float32) + N = boxes.shape[0] + K = query_boxes.shape[0] + iou = np.zeros((N, K), dtype=np.float32) + if N == 0 or K == 0: + return iou + threadsPerBlock = 8 * 8 + cuda.select_device(device_id) + blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock)) + + stream = cuda.stream() + with stream.auto_synchronize(): + boxes_dev = cuda.to_device(boxes.reshape([-1]), stream) + query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream) + iou_dev = cuda.to_device(iou.reshape([-1]), stream) + rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, + stream](N, K, boxes_dev, query_boxes_dev, + iou_dev, criterion) + iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) + return iou.astype(boxes.dtype) diff --git a/mmdet3d/core/evaluation/lyft_eval.py b/mmdet3d/core/evaluation/lyft_eval.py index 47c5cd6..0f8679c 100644 --- a/mmdet3d/core/evaluation/lyft_eval.py +++ b/mmdet3d/core/evaluation/lyft_eval.py @@ -1,285 +1,285 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from os import path as osp - -import mmcv -import numpy as np -from lyft_dataset_sdk.eval.detection.mAP_evaluation import (Box3D, get_ap, - get_class_names, - get_ious, - group_by_key, - wrap_in_box) -from mmcv.utils import print_log -from terminaltables import AsciiTable - - -def load_lyft_gts(lyft, data_root, eval_split, logger=None): - """Loads ground truth boxes from database. - - Args: - lyft (:obj:`LyftDataset`): Lyft class in the sdk. - data_root (str): Root of data for reading splits. - eval_split (str): Name of the split for evaluation. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - - Returns: - list[dict]: List of annotation dictionaries. - """ - split_scenes = mmcv.list_from_file( - osp.join(data_root, f'{eval_split}.txt')) - - # Read out all sample_tokens in DB. - sample_tokens_all = [s['token'] for s in lyft.sample] - assert len(sample_tokens_all) > 0, 'Error: Database has no samples!' - - if eval_split == 'test': - # Check that you aren't trying to cheat :) - assert len(lyft.sample_annotation) > 0, \ - 'Error: You are trying to evaluate on the test set \ - but you do not have the annotations!' - - sample_tokens = [] - for sample_token in sample_tokens_all: - scene_token = lyft.get('sample', sample_token)['scene_token'] - scene_record = lyft.get('scene', scene_token) - if scene_record['name'] in split_scenes: - sample_tokens.append(sample_token) - - all_annotations = [] - - print_log('Loading ground truth annotations...', logger=logger) - # Load annotations and filter predictions and annotations. - for sample_token in mmcv.track_iter_progress(sample_tokens): - sample = lyft.get('sample', sample_token) - sample_annotation_tokens = sample['anns'] - for sample_annotation_token in sample_annotation_tokens: - # Get label name in detection task and filter unused labels. - sample_annotation = \ - lyft.get('sample_annotation', sample_annotation_token) - detection_name = sample_annotation['category_name'] - if detection_name is None: - continue - annotation = { - 'sample_token': sample_token, - 'translation': sample_annotation['translation'], - 'size': sample_annotation['size'], - 'rotation': sample_annotation['rotation'], - 'name': detection_name, - } - all_annotations.append(annotation) - - return all_annotations - - -def load_lyft_predictions(res_path): - """Load Lyft predictions from json file. - - Args: - res_path (str): Path of result json file recording detections. - - Returns: - list[dict]: List of prediction dictionaries. - """ - predictions = mmcv.load(res_path) - predictions = predictions['results'] - all_preds = [] - for sample_token in predictions.keys(): - all_preds.extend(predictions[sample_token]) - return all_preds - - -def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None): - """Evaluation API for Lyft dataset. - - Args: - lyft (:obj:`LyftDataset`): Lyft class in the sdk. - data_root (str): Root of data for reading splits. - res_path (str): Path of result json file recording detections. - eval_set (str): Name of the split for evaluation. - output_dir (str): Output directory for output json files. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - - Returns: - dict[str, float]: The evaluation results. 
- """ - # evaluate by lyft metrics - gts = load_lyft_gts(lyft, data_root, eval_set, logger) - predictions = load_lyft_predictions(res_path) - - class_names = get_class_names(gts) - print('Calculating mAP@0.5:0.95...') - - iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] - metrics = {} - average_precisions = \ - get_classwise_aps(gts, predictions, class_names, iou_thresholds) - APs_data = [['IOU', 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]] - - mAPs = np.mean(average_precisions, axis=0) - mAPs_cate = np.mean(average_precisions, axis=1) - final_mAP = np.mean(mAPs) - - metrics['average_precisions'] = average_precisions.tolist() - metrics['mAPs'] = mAPs.tolist() - metrics['Final mAP'] = float(final_mAP) - metrics['class_names'] = class_names - metrics['mAPs_cate'] = mAPs_cate.tolist() - - APs_data = [['class', 'mAP@0.5:0.95']] - for i in range(len(class_names)): - row = [class_names[i], round(mAPs_cate[i], 3)] - APs_data.append(row) - APs_data.append(['Overall', round(final_mAP, 3)]) - APs_table = AsciiTable(APs_data, title='mAPs@0.5:0.95') - APs_table.inner_footing_row_border = True - print_log(APs_table.table, logger=logger) - - res_path = osp.join(output_dir, 'lyft_metrics.json') - mmcv.dump(metrics, res_path) - return metrics - - -def get_classwise_aps(gt, predictions, class_names, iou_thresholds): - """Returns an array with an average precision per class. - - Note: Ground truth and predictions should have the following format. - - .. code-block:: - - gt = [{ - 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 - fbb039a550991a5149214f98cec136ac', - 'translation': [974.2811881299899, 1714.6815014457964, - -23.689857123368846], - 'size': [1.796, 4.488, 1.664], - 'rotation': [0.14882026466054782, 0, 0, 0.9888642620837121], - 'name': 'car' - }] - - predictions = [{ - 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 - fbb039a550991a5149214f98cec136ac', - 'translation': [971.8343488872263, 1713.6816097857359, - -25.82534357061308], - 'size': [2.519726579986132, 7.810161372666739, 3.483438286096803], - 'rotation': [0.10913582721095375, 0.04099572636992043, - 0.01927712319721745, 1.029328402625659], - 'name': 'car', - 'score': 0.3077029437237213 - }] - - Args: - gt (list[dict]): list of dictionaries in the format described below. - predictions (list[dict]): list of dictionaries in the format - described below. - class_names (list[str]): list of the class names. - iou_thresholds (list[float]): IOU thresholds used to calculate - TP / FN - - Returns: - np.ndarray: an array with an average precision per class. - """ - assert all([0 <= iou_th <= 1 for iou_th in iou_thresholds]) - - gt_by_class_name = group_by_key(gt, 'name') - pred_by_class_name = group_by_key(predictions, 'name') - - average_precisions = np.zeros((len(class_names), len(iou_thresholds))) - - for class_id, class_name in enumerate(class_names): - if class_name in pred_by_class_name: - recalls, precisions, average_precision = get_single_class_aps( - gt_by_class_name[class_name], pred_by_class_name[class_name], - iou_thresholds) - average_precisions[class_id, :] = average_precision - - return average_precisions - - -def get_single_class_aps(gt, predictions, iou_thresholds): - """Compute recall and precision for all iou thresholds. Adapted from - LyftDatasetDevkit. - - Args: - gt (list[dict]): list of dictionaries in the format described above. - predictions (list[dict]): list of dictionaries in the format - described below. 
- iou_thresholds (list[float]): IOU thresholds used to calculate - TP / FN - - Returns: - tuple[np.ndarray]: Returns (recalls, precisions, average precisions) - for each class. - """ - num_gts = len(gt) - image_gts = group_by_key(gt, 'sample_token') - image_gts = wrap_in_box(image_gts) - - sample_gt_checked = { - sample_token: np.zeros((len(boxes), len(iou_thresholds))) - for sample_token, boxes in image_gts.items() - } - - predictions = sorted(predictions, key=lambda x: x['score'], reverse=True) - - # go down dets and mark TPs and FPs - num_predictions = len(predictions) - tps = np.zeros((num_predictions, len(iou_thresholds))) - fps = np.zeros((num_predictions, len(iou_thresholds))) - - for prediction_index, prediction in enumerate(predictions): - predicted_box = Box3D(**prediction) - - sample_token = prediction['sample_token'] - - max_overlap = -np.inf - jmax = -1 - - if sample_token in image_gts: - gt_boxes = image_gts[sample_token] - # gt_boxes per sample - gt_checked = sample_gt_checked[sample_token] - # gt flags per sample - else: - gt_boxes = [] - gt_checked = None - - if len(gt_boxes) > 0: - overlaps = get_ious(gt_boxes, predicted_box) - - max_overlap = np.max(overlaps) - - jmax = np.argmax(overlaps) - - for i, iou_threshold in enumerate(iou_thresholds): - if max_overlap > iou_threshold: - if gt_checked[jmax, i] == 0: - tps[prediction_index, i] = 1.0 - gt_checked[jmax, i] = 1 - else: - fps[prediction_index, i] = 1.0 - else: - fps[prediction_index, i] = 1.0 - - # compute precision recall - fps = np.cumsum(fps, axis=0) - tps = np.cumsum(tps, axis=0) - - recalls = tps / float(num_gts) - # avoid divide by zero in case the first detection - # matches a difficult ground truth - precisions = tps / np.maximum(tps + fps, np.finfo(np.float64).eps) - - aps = [] - for i in range(len(iou_thresholds)): - recall = recalls[:, i] - precision = precisions[:, i] - assert np.all(0 <= recall) & np.all(recall <= 1) - assert np.all(0 <= precision) & np.all(precision <= 1) - ap = get_ap(recall, precision) - aps.append(ap) - - aps = np.array(aps) - - return recalls, precisions, aps +# Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + +import mmcv +import numpy as np +from lyft_dataset_sdk.eval.detection.mAP_evaluation import (Box3D, get_ap, + get_class_names, + get_ious, + group_by_key, + wrap_in_box) +from mmcv.utils import print_log +from terminaltables import AsciiTable + + +def load_lyft_gts(lyft, data_root, eval_split, logger=None): + """Loads ground truth boxes from database. + + Args: + lyft (:obj:`LyftDataset`): Lyft class in the sdk. + data_root (str): Root of data for reading splits. + eval_split (str): Name of the split for evaluation. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + + Returns: + list[dict]: List of annotation dictionaries. + """ + split_scenes = mmcv.list_from_file( + osp.join(data_root, f'{eval_split}.txt')) + + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in lyft.sample] + assert len(sample_tokens_all) > 0, 'Error: Database has no samples!' + + if eval_split == 'test': + # Check that you aren't trying to cheat :) + assert len(lyft.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set \ + but you do not have the annotations!' 
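+        # The loop below keeps only the samples whose parent scene name
+        # appears in the split file loaded above, so the ground truth is
+        # restricted to the requested evaluation split.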
+ + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = lyft.get('sample', sample_token)['scene_token'] + scene_record = lyft.get('scene', scene_token) + if scene_record['name'] in split_scenes: + sample_tokens.append(sample_token) + + all_annotations = [] + + print_log('Loading ground truth annotations...', logger=logger) + # Load annotations and filter predictions and annotations. + for sample_token in mmcv.track_iter_progress(sample_tokens): + sample = lyft.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + for sample_annotation_token in sample_annotation_tokens: + # Get label name in detection task and filter unused labels. + sample_annotation = \ + lyft.get('sample_annotation', sample_annotation_token) + detection_name = sample_annotation['category_name'] + if detection_name is None: + continue + annotation = { + 'sample_token': sample_token, + 'translation': sample_annotation['translation'], + 'size': sample_annotation['size'], + 'rotation': sample_annotation['rotation'], + 'name': detection_name, + } + all_annotations.append(annotation) + + return all_annotations + + +def load_lyft_predictions(res_path): + """Load Lyft predictions from json file. + + Args: + res_path (str): Path of result json file recording detections. + + Returns: + list[dict]: List of prediction dictionaries. + """ + predictions = mmcv.load(res_path) + predictions = predictions['results'] + all_preds = [] + for sample_token in predictions.keys(): + all_preds.extend(predictions[sample_token]) + return all_preds + + +def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None): + """Evaluation API for Lyft dataset. + + Args: + lyft (:obj:`LyftDataset`): Lyft class in the sdk. + data_root (str): Root of data for reading splits. + res_path (str): Path of result json file recording detections. + eval_set (str): Name of the split for evaluation. + output_dir (str): Output directory for output json files. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + + Returns: + dict[str, float]: The evaluation results. 
+ """ + # evaluate by lyft metrics + gts = load_lyft_gts(lyft, data_root, eval_set, logger) + predictions = load_lyft_predictions(res_path) + + class_names = get_class_names(gts) + print('Calculating mAP@0.5:0.95...') + + iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + metrics = {} + average_precisions = \ + get_classwise_aps(gts, predictions, class_names, iou_thresholds) + APs_data = [['IOU', 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]] + + mAPs = np.mean(average_precisions, axis=0) + mAPs_cate = np.mean(average_precisions, axis=1) + final_mAP = np.mean(mAPs) + + metrics['average_precisions'] = average_precisions.tolist() + metrics['mAPs'] = mAPs.tolist() + metrics['Final mAP'] = float(final_mAP) + metrics['class_names'] = class_names + metrics['mAPs_cate'] = mAPs_cate.tolist() + + APs_data = [['class', 'mAP@0.5:0.95']] + for i in range(len(class_names)): + row = [class_names[i], round(mAPs_cate[i], 3)] + APs_data.append(row) + APs_data.append(['Overall', round(final_mAP, 3)]) + APs_table = AsciiTable(APs_data, title='mAPs@0.5:0.95') + APs_table.inner_footing_row_border = True + print_log(APs_table.table, logger=logger) + + res_path = osp.join(output_dir, 'lyft_metrics.json') + mmcv.dump(metrics, res_path) + return metrics + + +def get_classwise_aps(gt, predictions, class_names, iou_thresholds): + """Returns an array with an average precision per class. + + Note: Ground truth and predictions should have the following format. + + .. code-block:: + + gt = [{ + 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 + fbb039a550991a5149214f98cec136ac', + 'translation': [974.2811881299899, 1714.6815014457964, + -23.689857123368846], + 'size': [1.796, 4.488, 1.664], + 'rotation': [0.14882026466054782, 0, 0, 0.9888642620837121], + 'name': 'car' + }] + + predictions = [{ + 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 + fbb039a550991a5149214f98cec136ac', + 'translation': [971.8343488872263, 1713.6816097857359, + -25.82534357061308], + 'size': [2.519726579986132, 7.810161372666739, 3.483438286096803], + 'rotation': [0.10913582721095375, 0.04099572636992043, + 0.01927712319721745, 1.029328402625659], + 'name': 'car', + 'score': 0.3077029437237213 + }] + + Args: + gt (list[dict]): list of dictionaries in the format described below. + predictions (list[dict]): list of dictionaries in the format + described below. + class_names (list[str]): list of the class names. + iou_thresholds (list[float]): IOU thresholds used to calculate + TP / FN + + Returns: + np.ndarray: an array with an average precision per class. + """ + assert all([0 <= iou_th <= 1 for iou_th in iou_thresholds]) + + gt_by_class_name = group_by_key(gt, 'name') + pred_by_class_name = group_by_key(predictions, 'name') + + average_precisions = np.zeros((len(class_names), len(iou_thresholds))) + + for class_id, class_name in enumerate(class_names): + if class_name in pred_by_class_name: + recalls, precisions, average_precision = get_single_class_aps( + gt_by_class_name[class_name], pred_by_class_name[class_name], + iou_thresholds) + average_precisions[class_id, :] = average_precision + + return average_precisions + + +def get_single_class_aps(gt, predictions, iou_thresholds): + """Compute recall and precision for all iou thresholds. Adapted from + LyftDatasetDevkit. + + Args: + gt (list[dict]): list of dictionaries in the format described above. + predictions (list[dict]): list of dictionaries in the format + described below. 
+ iou_thresholds (list[float]): IOU thresholds used to calculate + TP / FN + + Returns: + tuple[np.ndarray]: Returns (recalls, precisions, average precisions) + for each class. + """ + num_gts = len(gt) + image_gts = group_by_key(gt, 'sample_token') + image_gts = wrap_in_box(image_gts) + + sample_gt_checked = { + sample_token: np.zeros((len(boxes), len(iou_thresholds))) + for sample_token, boxes in image_gts.items() + } + + predictions = sorted(predictions, key=lambda x: x['score'], reverse=True) + + # go down dets and mark TPs and FPs + num_predictions = len(predictions) + tps = np.zeros((num_predictions, len(iou_thresholds))) + fps = np.zeros((num_predictions, len(iou_thresholds))) + + for prediction_index, prediction in enumerate(predictions): + predicted_box = Box3D(**prediction) + + sample_token = prediction['sample_token'] + + max_overlap = -np.inf + jmax = -1 + + if sample_token in image_gts: + gt_boxes = image_gts[sample_token] + # gt_boxes per sample + gt_checked = sample_gt_checked[sample_token] + # gt flags per sample + else: + gt_boxes = [] + gt_checked = None + + if len(gt_boxes) > 0: + overlaps = get_ious(gt_boxes, predicted_box) + + max_overlap = np.max(overlaps) + + jmax = np.argmax(overlaps) + + for i, iou_threshold in enumerate(iou_thresholds): + if max_overlap > iou_threshold: + if gt_checked[jmax, i] == 0: + tps[prediction_index, i] = 1.0 + gt_checked[jmax, i] = 1 + else: + fps[prediction_index, i] = 1.0 + else: + fps[prediction_index, i] = 1.0 + + # compute precision recall + fps = np.cumsum(fps, axis=0) + tps = np.cumsum(tps, axis=0) + + recalls = tps / float(num_gts) + # avoid divide by zero in case the first detection + # matches a difficult ground truth + precisions = tps / np.maximum(tps + fps, np.finfo(np.float64).eps) + + aps = [] + for i in range(len(iou_thresholds)): + recall = recalls[:, i] + precision = precisions[:, i] + assert np.all(0 <= recall) & np.all(recall <= 1) + assert np.all(0 <= precision) & np.all(precision <= 1) + ap = get_ap(recall, precision) + aps.append(ap) + + aps = np.array(aps) + + return recalls, precisions, aps diff --git a/mmdet3d/core/evaluation/scannet_utils/__init__.py b/mmdet3d/core/evaluation/scannet_utils/__init__.py index c98ea83..6896713 100644 --- a/mmdet3d/core/evaluation/scannet_utils/__init__.py +++ b/mmdet3d/core/evaluation/scannet_utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .evaluate_semantic_instance import evaluate_matches, scannet_eval - -__all__ = ['scannet_eval', 'evaluate_matches'] +# Copyright (c) OpenMMLab. All rights reserved. +from .evaluate_semantic_instance import evaluate_matches, scannet_eval + +__all__ = ['scannet_eval', 'evaluate_matches'] diff --git a/mmdet3d/core/evaluation/scannet_utils/evaluate_semantic_instance.py b/mmdet3d/core/evaluation/scannet_utils/evaluate_semantic_instance.py index e4b9439..fae003f 100644 --- a/mmdet3d/core/evaluation/scannet_utils/evaluate_semantic_instance.py +++ b/mmdet3d/core/evaluation/scannet_utils/evaluate_semantic_instance.py @@ -1,347 +1,347 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/3d_evaluation/evaluate_semantic_instance.py # noqa -from copy import deepcopy - -import numpy as np - -from . import util_3d - - -def evaluate_matches(matches, class_labels, options): - """Evaluate instance segmentation from matched gt and predicted instances - for all scenes. - - Args: - matches (dict): Contains gt2pred and pred2gt infos for every scene. 
- class_labels (tuple[str]): Class names. - options (dict): ScanNet evaluator options. See get_options. - - Returns: - np.array: Average precision scores for all thresholds and categories. - """ - overlaps = options['overlaps'] - min_region_sizes = [options['min_region_sizes'][0]] - dist_threshes = [options['distance_threshes'][0]] - dist_confs = [options['distance_confs'][0]] - - # results: class x overlap - ap = np.zeros((len(dist_threshes), len(class_labels), len(overlaps)), - np.float) - for di, (min_region_size, distance_thresh, distance_conf) in enumerate( - zip(min_region_sizes, dist_threshes, dist_confs)): - for oi, overlap_th in enumerate(overlaps): - pred_visited = {} - for m in matches: - for label_name in class_labels: - for p in matches[m]['pred'][label_name]: - if 'filename' in p: - pred_visited[p['filename']] = False - for li, label_name in enumerate(class_labels): - y_true = np.empty(0) - y_score = np.empty(0) - hard_false_negatives = 0 - has_gt = False - has_pred = False - for m in matches: - pred_instances = matches[m]['pred'][label_name] - gt_instances = matches[m]['gt'][label_name] - # filter groups in ground truth - gt_instances = [ - gt for gt in gt_instances - if gt['instance_id'] >= 1000 and gt['vert_count'] >= - min_region_size and gt['med_dist'] <= distance_thresh - and gt['dist_conf'] >= distance_conf - ] - if gt_instances: - has_gt = True - if pred_instances: - has_pred = True - - cur_true = np.ones(len(gt_instances)) - cur_score = np.ones(len(gt_instances)) * (-float('inf')) - cur_match = np.zeros(len(gt_instances), dtype=np.bool) - # collect matches - for (gti, gt) in enumerate(gt_instances): - found_match = False - for pred in gt['matched_pred']: - # greedy assignments - if pred_visited[pred['filename']]: - continue - overlap = float(pred['intersection']) / ( - gt['vert_count'] + pred['vert_count'] - - pred['intersection']) - if overlap > overlap_th: - confidence = pred['confidence'] - # if already have a prediction for this gt, - # the prediction with the lower score is automatically a false positive # noqa - if cur_match[gti]: - max_score = max(cur_score[gti], confidence) - min_score = min(cur_score[gti], confidence) - cur_score[gti] = max_score - # append false positive - cur_true = np.append(cur_true, 0) - cur_score = np.append(cur_score, min_score) - cur_match = np.append(cur_match, True) - # otherwise set score - else: - found_match = True - cur_match[gti] = True - cur_score[gti] = confidence - pred_visited[pred['filename']] = True - if not found_match: - hard_false_negatives += 1 - # remove non-matched ground truth instances - cur_true = cur_true[cur_match] - cur_score = cur_score[cur_match] - - # collect non-matched predictions as false positive - for pred in pred_instances: - found_gt = False - for gt in pred['matched_gt']: - overlap = float(gt['intersection']) / ( - gt['vert_count'] + pred['vert_count'] - - gt['intersection']) - if overlap > overlap_th: - found_gt = True - break - if not found_gt: - num_ignore = pred['void_intersection'] - for gt in pred['matched_gt']: - # group? 
- if gt['instance_id'] < 1000: - num_ignore += gt['intersection'] - # small ground truth instances - if gt['vert_count'] < min_region_size or gt[ - 'med_dist'] > distance_thresh or gt[ - 'dist_conf'] < distance_conf: - num_ignore += gt['intersection'] - proportion_ignore = float( - num_ignore) / pred['vert_count'] - # if not ignored append false positive - if proportion_ignore <= overlap_th: - cur_true = np.append(cur_true, 0) - confidence = pred['confidence'] - cur_score = np.append(cur_score, confidence) - - # append to overall results - y_true = np.append(y_true, cur_true) - y_score = np.append(y_score, cur_score) - - # compute average precision - if has_gt and has_pred: - # compute precision recall curve first - - # sorting and cumsum - score_arg_sort = np.argsort(y_score) - y_score_sorted = y_score[score_arg_sort] - y_true_sorted = y_true[score_arg_sort] - y_true_sorted_cumsum = np.cumsum(y_true_sorted) - - # unique thresholds - (thresholds, unique_indices) = np.unique( - y_score_sorted, return_index=True) - num_prec_recall = len(unique_indices) + 1 - - # prepare precision recall - num_examples = len(y_score_sorted) - # follow https://github.com/ScanNet/ScanNet/pull/26 ? # noqa - num_true_examples = y_true_sorted_cumsum[-1] if len( - y_true_sorted_cumsum) > 0 else 0 - precision = np.zeros(num_prec_recall) - recall = np.zeros(num_prec_recall) - - # deal with the first point - y_true_sorted_cumsum = np.append(y_true_sorted_cumsum, 0) - # deal with remaining - for idx_res, idx_scores in enumerate(unique_indices): - cumsum = y_true_sorted_cumsum[idx_scores - 1] - tp = num_true_examples - cumsum - fp = num_examples - idx_scores - tp - fn = cumsum + hard_false_negatives - p = float(tp) / (tp + fp) - r = float(tp) / (tp + fn) - precision[idx_res] = p - recall[idx_res] = r - - # first point in curve is artificial - precision[-1] = 1. - recall[-1] = 0. - - # compute average of precision-recall curve - recall_for_conv = np.copy(recall) - recall_for_conv = np.append(recall_for_conv[0], - recall_for_conv) - recall_for_conv = np.append(recall_for_conv, 0.) - - stepWidths = np.convolve(recall_for_conv, [-0.5, 0, 0.5], - 'valid') - # integrate is now simply a dot product - ap_current = np.dot(precision, stepWidths) - - elif has_gt: - ap_current = 0.0 - else: - ap_current = float('nan') - ap[di, li, oi] = ap_current - return ap - - -def compute_averages(aps, options, class_labels): - """Averages AP scores for all categories. - - Args: - aps (np.array): AP scores for all thresholds and categories. - options (dict): ScanNet evaluator options. See get_options. - class_labels (tuple[str]): Class names. - - Returns: - dict: Overall and per-category AP scores. 
- """ - d_inf = 0 - o50 = np.where(np.isclose(options['overlaps'], 0.5)) - o25 = np.where(np.isclose(options['overlaps'], 0.25)) - o_all_but25 = np.where( - np.logical_not(np.isclose(options['overlaps'], 0.25))) - avg_dict = {} - avg_dict['all_ap'] = np.nanmean(aps[d_inf, :, o_all_but25]) - avg_dict['all_ap_50%'] = np.nanmean(aps[d_inf, :, o50]) - avg_dict['all_ap_25%'] = np.nanmean(aps[d_inf, :, o25]) - avg_dict['classes'] = {} - for (li, label_name) in enumerate(class_labels): - avg_dict['classes'][label_name] = {} - avg_dict['classes'][label_name]['ap'] = np.average(aps[d_inf, li, - o_all_but25]) - avg_dict['classes'][label_name]['ap50%'] = np.average(aps[d_inf, li, - o50]) - avg_dict['classes'][label_name]['ap25%'] = np.average(aps[d_inf, li, - o25]) - return avg_dict - - -def assign_instances_for_scan(pred_info, gt_ids, options, valid_class_ids, - class_labels, id_to_label): - """Assign gt and predicted instances for a single scene. - - Args: - pred_info (dict): Predicted masks, labels and scores. - gt_ids (np.array): Ground truth instance masks. - options (dict): ScanNet evaluator options. See get_options. - valid_class_ids (tuple[int]): Ids of valid categories. - class_labels (tuple[str]): Class names. - id_to_label (dict[int, str]): Mapping of valid class id to class label. - - Returns: - dict: Per class assigned gt to predicted instances. - dict: Per class assigned predicted to gt instances. - """ - # get gt instances - gt_instances = util_3d.get_instances(gt_ids, valid_class_ids, class_labels, - id_to_label) - # associate - gt2pred = deepcopy(gt_instances) - for label in gt2pred: - for gt in gt2pred[label]: - gt['matched_pred'] = [] - pred2gt = {} - for label in class_labels: - pred2gt[label] = [] - num_pred_instances = 0 - # mask of void labels in the ground truth - bool_void = np.logical_not(np.in1d(gt_ids // 1000, valid_class_ids)) - # go through all prediction masks - for pred_mask_file in pred_info: - label_id = int(pred_info[pred_mask_file]['label_id']) - conf = pred_info[pred_mask_file]['conf'] - if not label_id in id_to_label: # noqa E713 - continue - label_name = id_to_label[label_id] - # read the mask - pred_mask = pred_info[pred_mask_file]['mask'] - if len(pred_mask) != len(gt_ids): - raise ValueError('len(pred_mask) != len(gt_ids)') - # convert to binary - pred_mask = np.not_equal(pred_mask, 0) - num = np.count_nonzero(pred_mask) - if num < options['min_region_sizes'][0]: - continue # skip if empty - - pred_instance = {} - pred_instance['filename'] = pred_mask_file - pred_instance['pred_id'] = num_pred_instances - pred_instance['label_id'] = label_id - pred_instance['vert_count'] = num - pred_instance['confidence'] = conf - pred_instance['void_intersection'] = np.count_nonzero( - np.logical_and(bool_void, pred_mask)) - - # matched gt instances - matched_gt = [] - # go through all gt instances with matching label - for (gt_num, gt_inst) in enumerate(gt2pred[label_name]): - intersection = np.count_nonzero( - np.logical_and(gt_ids == gt_inst['instance_id'], pred_mask)) - if intersection > 0: - gt_copy = gt_inst.copy() - pred_copy = pred_instance.copy() - gt_copy['intersection'] = intersection - pred_copy['intersection'] = intersection - matched_gt.append(gt_copy) - gt2pred[label_name][gt_num]['matched_pred'].append(pred_copy) - pred_instance['matched_gt'] = matched_gt - num_pred_instances += 1 - pred2gt[label_name].append(pred_instance) - - return gt2pred, pred2gt - - -def scannet_eval(preds, gts, options, valid_class_ids, class_labels, - id_to_label): - """Evaluate 
instance segmentation in ScanNet protocol. - - Args: - preds (list[dict]): Per scene predictions of mask, label and - confidence. - gts (list[np.array]): Per scene ground truth instance masks. - options (dict): ScanNet evaluator options. See get_options. - valid_class_ids (tuple[int]): Ids of valid categories. - class_labels (tuple[str]): Class names. - id_to_label (dict[int, str]): Mapping of valid class id to class label. - - Returns: - dict: Overall and per-category AP scores. - """ - options = get_options(options) - matches = {} - for i, (pred, gt) in enumerate(zip(preds, gts)): - matches_key = i - # assign gt to predictions - gt2pred, pred2gt = assign_instances_for_scan(pred, gt, options, - valid_class_ids, - class_labels, id_to_label) - matches[matches_key] = {} - matches[matches_key]['gt'] = gt2pred - matches[matches_key]['pred'] = pred2gt - - ap_scores = evaluate_matches(matches, class_labels, options) - avgs = compute_averages(ap_scores, options, class_labels) - return avgs - - -def get_options(options=None): - """Set ScanNet evaluator options. - - Args: - options (dict, optional): Not default options. Default: None. - - Returns: - dict: Updated options with all 4 keys. - """ - assert options is None or isinstance(options, dict) - _options = dict( - overlaps=np.append(np.arange(0.5, 0.95, 0.05), 0.25), - min_region_sizes=np.array([100]), - distance_threshes=np.array([float('inf')]), - distance_confs=np.array([-float('inf')])) - if options is not None: - _options.update(options) - return _options +# Copyright (c) OpenMMLab. All rights reserved. +# adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/3d_evaluation/evaluate_semantic_instance.py # noqa +from copy import deepcopy + +import numpy as np + +from . import util_3d + + +def evaluate_matches(matches, class_labels, options): + """Evaluate instance segmentation from matched gt and predicted instances + for all scenes. + + Args: + matches (dict): Contains gt2pred and pred2gt infos for every scene. + class_labels (tuple[str]): Class names. + options (dict): ScanNet evaluator options. See get_options. + + Returns: + np.array: Average precision scores for all thresholds and categories. 
+ """ + overlaps = options['overlaps'] + min_region_sizes = [options['min_region_sizes'][0]] + dist_threshes = [options['distance_threshes'][0]] + dist_confs = [options['distance_confs'][0]] + + # results: class x overlap + ap = np.zeros((len(dist_threshes), len(class_labels), len(overlaps)), + np.float) + for di, (min_region_size, distance_thresh, distance_conf) in enumerate( + zip(min_region_sizes, dist_threshes, dist_confs)): + for oi, overlap_th in enumerate(overlaps): + pred_visited = {} + for m in matches: + for label_name in class_labels: + for p in matches[m]['pred'][label_name]: + if 'filename' in p: + pred_visited[p['filename']] = False + for li, label_name in enumerate(class_labels): + y_true = np.empty(0) + y_score = np.empty(0) + hard_false_negatives = 0 + has_gt = False + has_pred = False + for m in matches: + pred_instances = matches[m]['pred'][label_name] + gt_instances = matches[m]['gt'][label_name] + # filter groups in ground truth + gt_instances = [ + gt for gt in gt_instances + if gt['instance_id'] >= 1000 and gt['vert_count'] >= + min_region_size and gt['med_dist'] <= distance_thresh + and gt['dist_conf'] >= distance_conf + ] + if gt_instances: + has_gt = True + if pred_instances: + has_pred = True + + cur_true = np.ones(len(gt_instances)) + cur_score = np.ones(len(gt_instances)) * (-float('inf')) + cur_match = np.zeros(len(gt_instances), dtype=np.bool) + # collect matches + for (gti, gt) in enumerate(gt_instances): + found_match = False + for pred in gt['matched_pred']: + # greedy assignments + if pred_visited[pred['filename']]: + continue + overlap = float(pred['intersection']) / ( + gt['vert_count'] + pred['vert_count'] - + pred['intersection']) + if overlap > overlap_th: + confidence = pred['confidence'] + # if already have a prediction for this gt, + # the prediction with the lower score is automatically a false positive # noqa + if cur_match[gti]: + max_score = max(cur_score[gti], confidence) + min_score = min(cur_score[gti], confidence) + cur_score[gti] = max_score + # append false positive + cur_true = np.append(cur_true, 0) + cur_score = np.append(cur_score, min_score) + cur_match = np.append(cur_match, True) + # otherwise set score + else: + found_match = True + cur_match[gti] = True + cur_score[gti] = confidence + pred_visited[pred['filename']] = True + if not found_match: + hard_false_negatives += 1 + # remove non-matched ground truth instances + cur_true = cur_true[cur_match] + cur_score = cur_score[cur_match] + + # collect non-matched predictions as false positive + for pred in pred_instances: + found_gt = False + for gt in pred['matched_gt']: + overlap = float(gt['intersection']) / ( + gt['vert_count'] + pred['vert_count'] - + gt['intersection']) + if overlap > overlap_th: + found_gt = True + break + if not found_gt: + num_ignore = pred['void_intersection'] + for gt in pred['matched_gt']: + # group? 
+ if gt['instance_id'] < 1000: + num_ignore += gt['intersection'] + # small ground truth instances + if gt['vert_count'] < min_region_size or gt[ + 'med_dist'] > distance_thresh or gt[ + 'dist_conf'] < distance_conf: + num_ignore += gt['intersection'] + proportion_ignore = float( + num_ignore) / pred['vert_count'] + # if not ignored append false positive + if proportion_ignore <= overlap_th: + cur_true = np.append(cur_true, 0) + confidence = pred['confidence'] + cur_score = np.append(cur_score, confidence) + + # append to overall results + y_true = np.append(y_true, cur_true) + y_score = np.append(y_score, cur_score) + + # compute average precision + if has_gt and has_pred: + # compute precision recall curve first + + # sorting and cumsum + score_arg_sort = np.argsort(y_score) + y_score_sorted = y_score[score_arg_sort] + y_true_sorted = y_true[score_arg_sort] + y_true_sorted_cumsum = np.cumsum(y_true_sorted) + + # unique thresholds + (thresholds, unique_indices) = np.unique( + y_score_sorted, return_index=True) + num_prec_recall = len(unique_indices) + 1 + + # prepare precision recall + num_examples = len(y_score_sorted) + # follow https://github.com/ScanNet/ScanNet/pull/26 ? # noqa + num_true_examples = y_true_sorted_cumsum[-1] if len( + y_true_sorted_cumsum) > 0 else 0 + precision = np.zeros(num_prec_recall) + recall = np.zeros(num_prec_recall) + + # deal with the first point + y_true_sorted_cumsum = np.append(y_true_sorted_cumsum, 0) + # deal with remaining + for idx_res, idx_scores in enumerate(unique_indices): + cumsum = y_true_sorted_cumsum[idx_scores - 1] + tp = num_true_examples - cumsum + fp = num_examples - idx_scores - tp + fn = cumsum + hard_false_negatives + p = float(tp) / (tp + fp) + r = float(tp) / (tp + fn) + precision[idx_res] = p + recall[idx_res] = r + + # first point in curve is artificial + precision[-1] = 1. + recall[-1] = 0. + + # compute average of precision-recall curve + recall_for_conv = np.copy(recall) + recall_for_conv = np.append(recall_for_conv[0], + recall_for_conv) + recall_for_conv = np.append(recall_for_conv, 0.) + + stepWidths = np.convolve(recall_for_conv, [-0.5, 0, 0.5], + 'valid') + # integrate is now simply a dot product + ap_current = np.dot(precision, stepWidths) + + elif has_gt: + ap_current = 0.0 + else: + ap_current = float('nan') + ap[di, li, oi] = ap_current + return ap + + +def compute_averages(aps, options, class_labels): + """Averages AP scores for all categories. + + Args: + aps (np.array): AP scores for all thresholds and categories. + options (dict): ScanNet evaluator options. See get_options. + class_labels (tuple[str]): Class names. + + Returns: + dict: Overall and per-category AP scores. 
+ """ + d_inf = 0 + o50 = np.where(np.isclose(options['overlaps'], 0.5)) + o25 = np.where(np.isclose(options['overlaps'], 0.25)) + o_all_but25 = np.where( + np.logical_not(np.isclose(options['overlaps'], 0.25))) + avg_dict = {} + avg_dict['all_ap'] = np.nanmean(aps[d_inf, :, o_all_but25]) + avg_dict['all_ap_50%'] = np.nanmean(aps[d_inf, :, o50]) + avg_dict['all_ap_25%'] = np.nanmean(aps[d_inf, :, o25]) + avg_dict['classes'] = {} + for (li, label_name) in enumerate(class_labels): + avg_dict['classes'][label_name] = {} + avg_dict['classes'][label_name]['ap'] = np.average(aps[d_inf, li, + o_all_but25]) + avg_dict['classes'][label_name]['ap50%'] = np.average(aps[d_inf, li, + o50]) + avg_dict['classes'][label_name]['ap25%'] = np.average(aps[d_inf, li, + o25]) + return avg_dict + + +def assign_instances_for_scan(pred_info, gt_ids, options, valid_class_ids, + class_labels, id_to_label): + """Assign gt and predicted instances for a single scene. + + Args: + pred_info (dict): Predicted masks, labels and scores. + gt_ids (np.array): Ground truth instance masks. + options (dict): ScanNet evaluator options. See get_options. + valid_class_ids (tuple[int]): Ids of valid categories. + class_labels (tuple[str]): Class names. + id_to_label (dict[int, str]): Mapping of valid class id to class label. + + Returns: + dict: Per class assigned gt to predicted instances. + dict: Per class assigned predicted to gt instances. + """ + # get gt instances + gt_instances = util_3d.get_instances(gt_ids, valid_class_ids, class_labels, + id_to_label) + # associate + gt2pred = deepcopy(gt_instances) + for label in gt2pred: + for gt in gt2pred[label]: + gt['matched_pred'] = [] + pred2gt = {} + for label in class_labels: + pred2gt[label] = [] + num_pred_instances = 0 + # mask of void labels in the ground truth + bool_void = np.logical_not(np.in1d(gt_ids // 1000, valid_class_ids)) + # go through all prediction masks + for pred_mask_file in pred_info: + label_id = int(pred_info[pred_mask_file]['label_id']) + conf = pred_info[pred_mask_file]['conf'] + if not label_id in id_to_label: # noqa E713 + continue + label_name = id_to_label[label_id] + # read the mask + pred_mask = pred_info[pred_mask_file]['mask'] + if len(pred_mask) != len(gt_ids): + raise ValueError('len(pred_mask) != len(gt_ids)') + # convert to binary + pred_mask = np.not_equal(pred_mask, 0) + num = np.count_nonzero(pred_mask) + if num < options['min_region_sizes'][0]: + continue # skip if empty + + pred_instance = {} + pred_instance['filename'] = pred_mask_file + pred_instance['pred_id'] = num_pred_instances + pred_instance['label_id'] = label_id + pred_instance['vert_count'] = num + pred_instance['confidence'] = conf + pred_instance['void_intersection'] = np.count_nonzero( + np.logical_and(bool_void, pred_mask)) + + # matched gt instances + matched_gt = [] + # go through all gt instances with matching label + for (gt_num, gt_inst) in enumerate(gt2pred[label_name]): + intersection = np.count_nonzero( + np.logical_and(gt_ids == gt_inst['instance_id'], pred_mask)) + if intersection > 0: + gt_copy = gt_inst.copy() + pred_copy = pred_instance.copy() + gt_copy['intersection'] = intersection + pred_copy['intersection'] = intersection + matched_gt.append(gt_copy) + gt2pred[label_name][gt_num]['matched_pred'].append(pred_copy) + pred_instance['matched_gt'] = matched_gt + num_pred_instances += 1 + pred2gt[label_name].append(pred_instance) + + return gt2pred, pred2gt + + +def scannet_eval(preds, gts, options, valid_class_ids, class_labels, + id_to_label): + """Evaluate 
instance segmentation in ScanNet protocol. + + Args: + preds (list[dict]): Per scene predictions of mask, label and + confidence. + gts (list[np.array]): Per scene ground truth instance masks. + options (dict): ScanNet evaluator options. See get_options. + valid_class_ids (tuple[int]): Ids of valid categories. + class_labels (tuple[str]): Class names. + id_to_label (dict[int, str]): Mapping of valid class id to class label. + + Returns: + dict: Overall and per-category AP scores. + """ + options = get_options(options) + matches = {} + for i, (pred, gt) in enumerate(zip(preds, gts)): + matches_key = i + # assign gt to predictions + gt2pred, pred2gt = assign_instances_for_scan(pred, gt, options, + valid_class_ids, + class_labels, id_to_label) + matches[matches_key] = {} + matches[matches_key]['gt'] = gt2pred + matches[matches_key]['pred'] = pred2gt + + ap_scores = evaluate_matches(matches, class_labels, options) + avgs = compute_averages(ap_scores, options, class_labels) + return avgs + + +def get_options(options=None): + """Set ScanNet evaluator options. + + Args: + options (dict, optional): Not default options. Default: None. + + Returns: + dict: Updated options with all 4 keys. + """ + assert options is None or isinstance(options, dict) + _options = dict( + overlaps=np.append(np.arange(0.5, 0.95, 0.05), 0.25), + min_region_sizes=np.array([100]), + distance_threshes=np.array([float('inf')]), + distance_confs=np.array([-float('inf')])) + if options is not None: + _options.update(options) + return _options diff --git a/mmdet3d/core/evaluation/scannet_utils/util_3d.py b/mmdet3d/core/evaluation/scannet_utils/util_3d.py index 527d341..d75a7cc 100644 --- a/mmdet3d/core/evaluation/scannet_utils/util_3d.py +++ b/mmdet3d/core/evaluation/scannet_utils/util_3d.py @@ -1,84 +1,84 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/util_3d.py # noqa -import json - -import numpy as np - - -class Instance: - """Single instance for ScanNet evaluator. - - Args: - mesh_vert_instances (np.array): Instance ids for each point. - instance_id: Id of single instance. - """ - instance_id = 0 - label_id = 0 - vert_count = 0 - med_dist = -1 - dist_conf = 0.0 - - def __init__(self, mesh_vert_instances, instance_id): - if instance_id == -1: - return - self.instance_id = int(instance_id) - self.label_id = int(self.get_label_id(instance_id)) - self.vert_count = int( - self.get_instance_verts(mesh_vert_instances, instance_id)) - - @staticmethod - def get_label_id(instance_id): - return int(instance_id // 1000) - - @staticmethod - def get_instance_verts(mesh_vert_instances, instance_id): - return (mesh_vert_instances == instance_id).sum() - - def to_json(self): - return json.dumps( - self, default=lambda o: o.__dict__, sort_keys=True, indent=4) - - def to_dict(self): - dict = {} - dict['instance_id'] = self.instance_id - dict['label_id'] = self.label_id - dict['vert_count'] = self.vert_count - dict['med_dist'] = self.med_dist - dict['dist_conf'] = self.dist_conf - return dict - - def from_json(self, data): - self.instance_id = int(data['instance_id']) - self.label_id = int(data['label_id']) - self.vert_count = int(data['vert_count']) - if 'med_dist' in data: - self.med_dist = float(data['med_dist']) - self.dist_conf = float(data['dist_conf']) - - def __str__(self): - return '(' + str(self.instance_id) + ')' - - -def get_instances(ids, class_ids, class_labels, id2label): - """Transform gt instance mask to Instance objects. 
- - Args: - ids (np.array): Instance ids for each point. - class_ids: (tuple[int]): Ids of valid categories. - class_labels (tuple[str]): Class names. - id2label: (dict[int, str]): Mapping of valid class id to class label. - - Returns: - dict [str, list]: Instance objects grouped by class label. - """ - instances = {} - for label in class_labels: - instances[label] = [] - instance_ids = np.unique(ids) - for id in instance_ids: - if id == 0: - continue - inst = Instance(ids, id) - if inst.label_id in class_ids: - instances[id2label[inst.label_id]].append(inst.to_dict()) - return instances +# Copyright (c) OpenMMLab. All rights reserved. +# adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/util_3d.py # noqa +import json + +import numpy as np + + +class Instance: + """Single instance for ScanNet evaluator. + + Args: + mesh_vert_instances (np.array): Instance ids for each point. + instance_id: Id of single instance. + """ + instance_id = 0 + label_id = 0 + vert_count = 0 + med_dist = -1 + dist_conf = 0.0 + + def __init__(self, mesh_vert_instances, instance_id): + if instance_id == -1: + return + self.instance_id = int(instance_id) + self.label_id = int(self.get_label_id(instance_id)) + self.vert_count = int( + self.get_instance_verts(mesh_vert_instances, instance_id)) + + @staticmethod + def get_label_id(instance_id): + return int(instance_id // 1000) + + @staticmethod + def get_instance_verts(mesh_vert_instances, instance_id): + return (mesh_vert_instances == instance_id).sum() + + def to_json(self): + return json.dumps( + self, default=lambda o: o.__dict__, sort_keys=True, indent=4) + + def to_dict(self): + dict = {} + dict['instance_id'] = self.instance_id + dict['label_id'] = self.label_id + dict['vert_count'] = self.vert_count + dict['med_dist'] = self.med_dist + dict['dist_conf'] = self.dist_conf + return dict + + def from_json(self, data): + self.instance_id = int(data['instance_id']) + self.label_id = int(data['label_id']) + self.vert_count = int(data['vert_count']) + if 'med_dist' in data: + self.med_dist = float(data['med_dist']) + self.dist_conf = float(data['dist_conf']) + + def __str__(self): + return '(' + str(self.instance_id) + ')' + + +def get_instances(ids, class_ids, class_labels, id2label): + """Transform gt instance mask to Instance objects. + + Args: + ids (np.array): Instance ids for each point. + class_ids: (tuple[int]): Ids of valid categories. + class_labels (tuple[str]): Class names. + id2label: (dict[int, str]): Mapping of valid class id to class label. + + Returns: + dict [str, list]: Instance objects grouped by class label. + """ + instances = {} + for label in class_labels: + instances[label] = [] + instance_ids = np.unique(ids) + for id in instance_ids: + if id == 0: + continue + inst = Instance(ids, id) + if inst.label_id in class_ids: + instances[id2label[inst.label_id]].append(inst.to_dict()) + return instances diff --git a/mmdet3d/core/evaluation/seg_eval.py b/mmdet3d/core/evaluation/seg_eval.py index 4a3166d..72218f5 100644 --- a/mmdet3d/core/evaluation/seg_eval.py +++ b/mmdet3d/core/evaluation/seg_eval.py @@ -1,131 +1,131 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -from mmcv.utils import print_log -from terminaltables import AsciiTable - - -def fast_hist(preds, labels, num_classes): - """Compute the confusion matrix for every batch. - - Args: - preds (np.ndarray): Prediction labels of points with shape of - (num_points, ). 
- labels (np.ndarray): Ground truth labels of points with shape of - (num_points, ). - num_classes (int): number of classes - - Returns: - np.ndarray: Calculated confusion matrix. - """ - - k = (labels >= 0) & (labels < num_classes) - bin_count = np.bincount( - num_classes * labels[k].astype(int) + preds[k], - minlength=num_classes**2) - return bin_count[:num_classes**2].reshape(num_classes, num_classes) - - -def per_class_iou(hist): - """Compute the per class iou. - - Args: - hist(np.ndarray): Overall confusion martix - (num_classes, num_classes ). - - Returns: - np.ndarray: Calculated per class iou - """ - - return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) - - -def get_acc(hist): - """Compute the overall accuracy. - - Args: - hist(np.ndarray): Overall confusion martix - (num_classes, num_classes ). - - Returns: - float: Calculated overall acc - """ - - return np.diag(hist).sum() / hist.sum() - - -def get_acc_cls(hist): - """Compute the class average accuracy. - - Args: - hist(np.ndarray): Overall confusion martix - (num_classes, num_classes ). - - Returns: - float: Calculated class average acc - """ - - return np.nanmean(np.diag(hist) / hist.sum(axis=1)) - - -def seg_eval(gt_labels, seg_preds, label2cat, ignore_index, logger=None): - """Semantic Segmentation Evaluation. - - Evaluate the result of the Semantic Segmentation. - - Args: - gt_labels (list[torch.Tensor]): Ground truth labels. - seg_preds (list[torch.Tensor]): Predictions. - label2cat (dict): Map from label to category name. - ignore_index (int): Index that will be ignored in evaluation. - logger (logging.Logger | str, optional): The way to print the mAP - summary. See `mmdet.utils.print_log()` for details. Default: None. - - Returns: - dict[str, float]: Dict of results. - """ - assert len(seg_preds) == len(gt_labels) - num_classes = len(label2cat) - - hist_list = [] - for i in range(len(gt_labels)): - gt_seg = gt_labels[i].clone().numpy().astype(np.int) - pred_seg = seg_preds[i].clone().numpy().astype(np.int) - - # filter out ignored points - pred_seg[gt_seg == ignore_index] = -1 - gt_seg[gt_seg == ignore_index] = -1 - - # calculate one instance result - hist_list.append(fast_hist(pred_seg, gt_seg, num_classes)) - - iou = per_class_iou(sum(hist_list)) - miou = np.nanmean(iou) - acc = get_acc(sum(hist_list)) - acc_cls = get_acc_cls(sum(hist_list)) - - header = ['classes'] - for i in range(len(label2cat)): - header.append(label2cat[i]) - header.extend(['miou', 'acc', 'acc_cls']) - - ret_dict = dict() - table_columns = [['results']] - for i in range(len(label2cat)): - ret_dict[label2cat[i]] = float(iou[i]) - table_columns.append([f'{iou[i]:.4f}']) - ret_dict['miou'] = float(miou) - ret_dict['acc'] = float(acc) - ret_dict['acc_cls'] = float(acc_cls) - - table_columns.append([f'{miou:.4f}']) - table_columns.append([f'{acc:.4f}']) - table_columns.append([f'{acc_cls:.4f}']) - - table_data = [header] - table_rows = list(zip(*table_columns)) - table_data += table_rows - table = AsciiTable(table_data) - table.inner_footing_row_border = True - print_log('\n' + table.table, logger=logger) - - return ret_dict +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + + +def fast_hist(preds, labels, num_classes): + """Compute the confusion matrix for every batch. + + Args: + preds (np.ndarray): Prediction labels of points with shape of + (num_points, ). + labels (np.ndarray): Ground truth labels of points with shape of + (num_points, ). 
+ num_classes (int): number of classes + + Returns: + np.ndarray: Calculated confusion matrix. + """ + + k = (labels >= 0) & (labels < num_classes) + bin_count = np.bincount( + num_classes * labels[k].astype(int) + preds[k], + minlength=num_classes**2) + return bin_count[:num_classes**2].reshape(num_classes, num_classes) + + +def per_class_iou(hist): + """Compute the per class iou. + + Args: + hist(np.ndarray): Overall confusion martix + (num_classes, num_classes ). + + Returns: + np.ndarray: Calculated per class iou + """ + + return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) + + +def get_acc(hist): + """Compute the overall accuracy. + + Args: + hist(np.ndarray): Overall confusion martix + (num_classes, num_classes ). + + Returns: + float: Calculated overall acc + """ + + return np.diag(hist).sum() / hist.sum() + + +def get_acc_cls(hist): + """Compute the class average accuracy. + + Args: + hist(np.ndarray): Overall confusion martix + (num_classes, num_classes ). + + Returns: + float: Calculated class average acc + """ + + return np.nanmean(np.diag(hist) / hist.sum(axis=1)) + + +def seg_eval(gt_labels, seg_preds, label2cat, ignore_index, logger=None): + """Semantic Segmentation Evaluation. + + Evaluate the result of the Semantic Segmentation. + + Args: + gt_labels (list[torch.Tensor]): Ground truth labels. + seg_preds (list[torch.Tensor]): Predictions. + label2cat (dict): Map from label to category name. + ignore_index (int): Index that will be ignored in evaluation. + logger (logging.Logger | str, optional): The way to print the mAP + summary. See `mmdet.utils.print_log()` for details. Default: None. + + Returns: + dict[str, float]: Dict of results. + """ + assert len(seg_preds) == len(gt_labels) + num_classes = len(label2cat) + + hist_list = [] + for i in range(len(gt_labels)): + gt_seg = gt_labels[i].clone().numpy().astype(np.int) + pred_seg = seg_preds[i].clone().numpy().astype(np.int) + + # filter out ignored points + pred_seg[gt_seg == ignore_index] = -1 + gt_seg[gt_seg == ignore_index] = -1 + + # calculate one instance result + hist_list.append(fast_hist(pred_seg, gt_seg, num_classes)) + + iou = per_class_iou(sum(hist_list)) + miou = np.nanmean(iou) + acc = get_acc(sum(hist_list)) + acc_cls = get_acc_cls(sum(hist_list)) + + header = ['classes'] + for i in range(len(label2cat)): + header.append(label2cat[i]) + header.extend(['miou', 'acc', 'acc_cls']) + + ret_dict = dict() + table_columns = [['results']] + for i in range(len(label2cat)): + ret_dict[label2cat[i]] = float(iou[i]) + table_columns.append([f'{iou[i]:.4f}']) + ret_dict['miou'] = float(miou) + ret_dict['acc'] = float(acc) + ret_dict['acc_cls'] = float(acc_cls) + + table_columns.append([f'{miou:.4f}']) + table_columns.append([f'{acc:.4f}']) + table_columns.append([f'{acc_cls:.4f}']) + + table_data = [header] + table_rows = list(zip(*table_columns)) + table_data += table_rows + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + + return ret_dict diff --git a/mmdet3d/core/evaluation/waymo_utils/__init__.py b/mmdet3d/core/evaluation/waymo_utils/__init__.py index 72d3a9b..f6cf3e9 100644 --- a/mmdet3d/core/evaluation/waymo_utils/__init__.py +++ b/mmdet3d/core/evaluation/waymo_utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .prediction_kitti_to_waymo import KITTI2Waymo - -__all__ = ['KITTI2Waymo'] +# Copyright (c) OpenMMLab. All rights reserved. 
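A hedged usage sketch for the seg_eval function added above; the label ids and class names are made up, and an older numpy is assumed because the code still relies on the deprecated np.int alias:

import torch
from mmdet3d.core.evaluation.seg_eval import seg_eval

label2cat = {0: 'wall', 1: 'floor'}    # assumed class names
gt_labels = [torch.tensor([0, 0, 1, 2]), torch.tensor([1, 1, 0, 0])]
seg_preds = [torch.tensor([0, 1, 1, 0]), torch.tensor([1, 1, 0, 1])]

ret = seg_eval(gt_labels, seg_preds, label2cat, ignore_index=2)
# ret maps 'wall' and 'floor' to per-class IoU and also carries
# 'miou', 'acc' and 'acc_cls'; an AsciiTable summary goes to print_log.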
+from .prediction_kitti_to_waymo import KITTI2Waymo + +__all__ = ['KITTI2Waymo'] diff --git a/mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py b/mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py index 205c24c..3ecc85a 100644 --- a/mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py +++ b/mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py @@ -1,263 +1,263 @@ -# Copyright (c) OpenMMLab. All rights reserved. -r"""Adapted from `Waymo to KITTI converter - `_. -""" - -try: - from waymo_open_dataset import dataset_pb2 as open_dataset -except ImportError: - raise ImportError( - 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' - 'to install the official devkit first.') - -from glob import glob -from os.path import join - -import mmcv -import numpy as np -import tensorflow as tf -from waymo_open_dataset import label_pb2 -from waymo_open_dataset.protos import metrics_pb2 - - -class KITTI2Waymo(object): - """KITTI predictions to Waymo converter. - - This class serves as the converter to change predictions from KITTI to - Waymo format. - - Args: - kitti_result_files (list[dict]): Predictions in KITTI format. - waymo_tfrecords_dir (str): Directory to load waymo raw data. - waymo_results_save_dir (str): Directory to save converted predictions - in waymo format (.bin files). - waymo_results_final_path (str): Path to save combined - predictions in waymo format (.bin file), like 'a/b/c.bin'. - prefix (str): Prefix of filename. In general, 0 for training, 1 for - validation and 2 for testing. - workers (str): Number of parallel processes. - """ - - def __init__(self, - kitti_result_files, - waymo_tfrecords_dir, - waymo_results_save_dir, - waymo_results_final_path, - prefix, - workers=64): - - self.kitti_result_files = kitti_result_files - self.waymo_tfrecords_dir = waymo_tfrecords_dir - self.waymo_results_save_dir = waymo_results_save_dir - self.waymo_results_final_path = waymo_results_final_path - self.prefix = prefix - self.workers = int(workers) - self.name2idx = {} - for idx, result in enumerate(kitti_result_files): - if len(result['sample_idx']) > 0: - self.name2idx[str(result['sample_idx'][0])] = idx - - # turn on eager execution for older tensorflow versions - if int(tf.__version__.split('.')[0]) < 2: - tf.enable_eager_execution() - - self.k2w_cls_map = { - 'Car': label_pb2.Label.TYPE_VEHICLE, - 'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN, - 'Sign': label_pb2.Label.TYPE_SIGN, - 'Cyclist': label_pb2.Label.TYPE_CYCLIST, - } - - self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], - [-1.0, 0.0, 0.0, 0.0], - [0.0, -1.0, 0.0, 0.0], - [0.0, 0.0, 0.0, 1.0]]) - - self.get_file_names() - self.create_folder() - - def get_file_names(self): - """Get file names of waymo raw data.""" - self.waymo_tfrecord_pathnames = sorted( - glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) - print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') - - def create_folder(self): - """Create folder for data conversion.""" - mmcv.mkdir_or_exist(self.waymo_results_save_dir) - - def parse_objects(self, kitti_result, T_k2w, context_name, - frame_timestamp_micros): - """Parse one prediction with several instances in kitti format and - convert them to `Object` proto. - - Args: - kitti_result (dict): Predictions in kitti format. - - - name (np.ndarray): Class labels of predictions. - - dimensions (np.ndarray): Height, width, length of boxes. - - location (np.ndarray): Bottom center of boxes (x, y, z). - - rotation_y (np.ndarray): Orientation of boxes. 
- - score (np.ndarray): Scores of predictions. - T_k2w (np.ndarray): Transformation matrix from kitti to waymo. - context_name (str): Context name of the frame. - frame_timestamp_micros (int): Frame timestamp. - - Returns: - :obj:`Object`: Predictions in waymo dataset Object proto. - """ - - def parse_one_object(instance_idx): - """Parse one instance in kitti format and convert them to `Object` - proto. - - Args: - instance_idx (int): Index of the instance to be converted. - - Returns: - :obj:`Object`: Predicted instance in waymo dataset - Object proto. - """ - cls = kitti_result['name'][instance_idx] - length = round(kitti_result['dimensions'][instance_idx, 0], 4) - height = round(kitti_result['dimensions'][instance_idx, 1], 4) - width = round(kitti_result['dimensions'][instance_idx, 2], 4) - x = round(kitti_result['location'][instance_idx, 0], 4) - y = round(kitti_result['location'][instance_idx, 1], 4) - z = round(kitti_result['location'][instance_idx, 2], 4) - rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) - score = round(kitti_result['score'][instance_idx], 4) - - # y: downwards; move box origin from bottom center (kitti) to - # true center (waymo) - y -= height / 2 - # frame transformation: kitti -> waymo - x, y, z = self.transform(T_k2w, x, y, z) - - # different conventions - heading = -(rotation_y + np.pi / 2) - while heading < -np.pi: - heading += 2 * np.pi - while heading > np.pi: - heading -= 2 * np.pi - - box = label_pb2.Label.Box() - box.center_x = x - box.center_y = y - box.center_z = z - box.length = length - box.width = width - box.height = height - box.heading = heading - - o = metrics_pb2.Object() - o.object.box.CopyFrom(box) - o.object.type = self.k2w_cls_map[cls] - o.score = score - - o.context_name = context_name - o.frame_timestamp_micros = frame_timestamp_micros - - return o - - objects = metrics_pb2.Objects() - - for instance_idx in range(len(kitti_result['name'])): - o = parse_one_object(instance_idx) - objects.objects.append(o) - - return objects - - def convert_one(self, file_idx): - """Convert action for single file. - - Args: - file_idx (int): Index of the file to be converted. 
- """ - file_pathname = self.waymo_tfrecord_pathnames[file_idx] - file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') - - for frame_num, frame_data in enumerate(file_data): - frame = open_dataset.Frame() - frame.ParseFromString(bytearray(frame_data.numpy())) - - filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' - - for camera in frame.context.camera_calibrations: - # FRONT = 1, see dataset.proto for details - if camera.name == 1: - T_front_cam_to_vehicle = np.array( - camera.extrinsic.transform).reshape(4, 4) - - T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam - - context_name = frame.context.name - frame_timestamp_micros = frame.timestamp_micros - - if filename in self.name2idx: - kitti_result = \ - self.kitti_result_files[self.name2idx[filename]] - objects = self.parse_objects(kitti_result, T_k2w, context_name, - frame_timestamp_micros) - else: - print(filename, 'not found.') - objects = metrics_pb2.Objects() - - with open( - join(self.waymo_results_save_dir, f'{filename}.bin'), - 'wb') as f: - f.write(objects.SerializeToString()) - - def convert(self): - """Convert action.""" - print('Start converting ...') - mmcv.track_parallel_progress(self.convert_one, range(len(self)), - self.workers) - print('\nFinished ...') - - # combine all files into one .bin - pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) - combined = self.combine(pathnames) - - with open(self.waymo_results_final_path, 'wb') as f: - f.write(combined.SerializeToString()) - - def __len__(self): - """Length of the filename list.""" - return len(self.waymo_tfrecord_pathnames) - - def transform(self, T, x, y, z): - """Transform the coordinates with matrix T. - - Args: - T (np.ndarray): Transformation matrix. - x(float): Coordinate in x axis. - y(float): Coordinate in y axis. - z(float): Coordinate in z axis. - - Returns: - list: Coordinates after transformation. - """ - pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) - pt_aft = np.matmul(T, pt_bef) - return pt_aft[:3].flatten().tolist() - - def combine(self, pathnames): - """Combine predictions in waymo format for each sample together. - - Args: - pathnames (str): Paths to save predictions. - - Returns: - :obj:`Objects`: Combined predictions in Objects proto. - """ - combined = metrics_pb2.Objects() - - for pathname in pathnames: - objects = metrics_pb2.Objects() - with open(pathname, 'rb') as f: - objects.ParseFromString(f.read()) - for o in objects.objects: - combined.objects.append(o) - - return combined +# Copyright (c) OpenMMLab. All rights reserved. +r"""Adapted from `Waymo to KITTI converter + `_. +""" + +try: + from waymo_open_dataset import dataset_pb2 as open_dataset +except ImportError: + raise ImportError( + 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' + 'to install the official devkit first.') + +from glob import glob +from os.path import join + +import mmcv +import numpy as np +import tensorflow as tf +from waymo_open_dataset import label_pb2 +from waymo_open_dataset.protos import metrics_pb2 + + +class KITTI2Waymo(object): + """KITTI predictions to Waymo converter. + + This class serves as the converter to change predictions from KITTI to + Waymo format. + + Args: + kitti_result_files (list[dict]): Predictions in KITTI format. + waymo_tfrecords_dir (str): Directory to load waymo raw data. + waymo_results_save_dir (str): Directory to save converted predictions + in waymo format (.bin files). 
+ waymo_results_final_path (str): Path to save combined + predictions in waymo format (.bin file), like 'a/b/c.bin'. + prefix (str): Prefix of filename. In general, 0 for training, 1 for + validation and 2 for testing. + workers (str): Number of parallel processes. + """ + + def __init__(self, + kitti_result_files, + waymo_tfrecords_dir, + waymo_results_save_dir, + waymo_results_final_path, + prefix, + workers=64): + + self.kitti_result_files = kitti_result_files + self.waymo_tfrecords_dir = waymo_tfrecords_dir + self.waymo_results_save_dir = waymo_results_save_dir + self.waymo_results_final_path = waymo_results_final_path + self.prefix = prefix + self.workers = int(workers) + self.name2idx = {} + for idx, result in enumerate(kitti_result_files): + if len(result['sample_idx']) > 0: + self.name2idx[str(result['sample_idx'][0])] = idx + + # turn on eager execution for older tensorflow versions + if int(tf.__version__.split('.')[0]) < 2: + tf.enable_eager_execution() + + self.k2w_cls_map = { + 'Car': label_pb2.Label.TYPE_VEHICLE, + 'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN, + 'Sign': label_pb2.Label.TYPE_SIGN, + 'Cyclist': label_pb2.Label.TYPE_CYCLIST, + } + + self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], + [-1.0, 0.0, 0.0, 0.0], + [0.0, -1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.0]]) + + self.get_file_names() + self.create_folder() + + def get_file_names(self): + """Get file names of waymo raw data.""" + self.waymo_tfrecord_pathnames = sorted( + glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) + print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') + + def create_folder(self): + """Create folder for data conversion.""" + mmcv.mkdir_or_exist(self.waymo_results_save_dir) + + def parse_objects(self, kitti_result, T_k2w, context_name, + frame_timestamp_micros): + """Parse one prediction with several instances in kitti format and + convert them to `Object` proto. + + Args: + kitti_result (dict): Predictions in kitti format. + + - name (np.ndarray): Class labels of predictions. + - dimensions (np.ndarray): Height, width, length of boxes. + - location (np.ndarray): Bottom center of boxes (x, y, z). + - rotation_y (np.ndarray): Orientation of boxes. + - score (np.ndarray): Scores of predictions. + T_k2w (np.ndarray): Transformation matrix from kitti to waymo. + context_name (str): Context name of the frame. + frame_timestamp_micros (int): Frame timestamp. + + Returns: + :obj:`Object`: Predictions in waymo dataset Object proto. + """ + + def parse_one_object(instance_idx): + """Parse one instance in kitti format and convert them to `Object` + proto. + + Args: + instance_idx (int): Index of the instance to be converted. + + Returns: + :obj:`Object`: Predicted instance in waymo dataset + Object proto. 
+ """ + cls = kitti_result['name'][instance_idx] + length = round(kitti_result['dimensions'][instance_idx, 0], 4) + height = round(kitti_result['dimensions'][instance_idx, 1], 4) + width = round(kitti_result['dimensions'][instance_idx, 2], 4) + x = round(kitti_result['location'][instance_idx, 0], 4) + y = round(kitti_result['location'][instance_idx, 1], 4) + z = round(kitti_result['location'][instance_idx, 2], 4) + rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) + score = round(kitti_result['score'][instance_idx], 4) + + # y: downwards; move box origin from bottom center (kitti) to + # true center (waymo) + y -= height / 2 + # frame transformation: kitti -> waymo + x, y, z = self.transform(T_k2w, x, y, z) + + # different conventions + heading = -(rotation_y + np.pi / 2) + while heading < -np.pi: + heading += 2 * np.pi + while heading > np.pi: + heading -= 2 * np.pi + + box = label_pb2.Label.Box() + box.center_x = x + box.center_y = y + box.center_z = z + box.length = length + box.width = width + box.height = height + box.heading = heading + + o = metrics_pb2.Object() + o.object.box.CopyFrom(box) + o.object.type = self.k2w_cls_map[cls] + o.score = score + + o.context_name = context_name + o.frame_timestamp_micros = frame_timestamp_micros + + return o + + objects = metrics_pb2.Objects() + + for instance_idx in range(len(kitti_result['name'])): + o = parse_one_object(instance_idx) + objects.objects.append(o) + + return objects + + def convert_one(self, file_idx): + """Convert action for single file. + + Args: + file_idx (int): Index of the file to be converted. + """ + file_pathname = self.waymo_tfrecord_pathnames[file_idx] + file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') + + for frame_num, frame_data in enumerate(file_data): + frame = open_dataset.Frame() + frame.ParseFromString(bytearray(frame_data.numpy())) + + filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' + + for camera in frame.context.camera_calibrations: + # FRONT = 1, see dataset.proto for details + if camera.name == 1: + T_front_cam_to_vehicle = np.array( + camera.extrinsic.transform).reshape(4, 4) + + T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam + + context_name = frame.context.name + frame_timestamp_micros = frame.timestamp_micros + + if filename in self.name2idx: + kitti_result = \ + self.kitti_result_files[self.name2idx[filename]] + objects = self.parse_objects(kitti_result, T_k2w, context_name, + frame_timestamp_micros) + else: + print(filename, 'not found.') + objects = metrics_pb2.Objects() + + with open( + join(self.waymo_results_save_dir, f'{filename}.bin'), + 'wb') as f: + f.write(objects.SerializeToString()) + + def convert(self): + """Convert action.""" + print('Start converting ...') + mmcv.track_parallel_progress(self.convert_one, range(len(self)), + self.workers) + print('\nFinished ...') + + # combine all files into one .bin + pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) + combined = self.combine(pathnames) + + with open(self.waymo_results_final_path, 'wb') as f: + f.write(combined.SerializeToString()) + + def __len__(self): + """Length of the filename list.""" + return len(self.waymo_tfrecord_pathnames) + + def transform(self, T, x, y, z): + """Transform the coordinates with matrix T. + + Args: + T (np.ndarray): Transformation matrix. + x(float): Coordinate in x axis. + y(float): Coordinate in y axis. + z(float): Coordinate in z axis. + + Returns: + list: Coordinates after transformation. 
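For orientation, a sketch of how the converter above is typically driven end to end; every path, the results file and the worker count are hypothetical placeholders, and the waymo-open-dataset / tensorflow dependencies from the imports above are assumed to be installed:

import mmcv

kitti_results = mmcv.load('results/kitti_format_results.pkl')  # list[dict] (assumed file)
converter = KITTI2Waymo(
    kitti_results,
    waymo_tfrecords_dir='data/waymo/waymo_format/validation',
    waymo_results_save_dir='results/waymo_bins',
    waymo_results_final_path='results/pred.bin',
    prefix='1',      # 1 marks the validation split, per the docstring above
    workers=8)
converter.convert()  # writes one .bin per tfrecord, then the combined pred.bin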
+ """ + pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) + pt_aft = np.matmul(T, pt_bef) + return pt_aft[:3].flatten().tolist() + + def combine(self, pathnames): + """Combine predictions in waymo format for each sample together. + + Args: + pathnames (str): Paths to save predictions. + + Returns: + :obj:`Objects`: Combined predictions in Objects proto. + """ + combined = metrics_pb2.Objects() + + for pathname in pathnames: + objects = metrics_pb2.Objects() + with open(pathname, 'rb') as f: + objects.ParseFromString(f.read()) + for o in objects.objects: + combined.objects.append(o) + + return combined diff --git a/mmdet3d/core/points/__init__.py b/mmdet3d/core/points/__init__.py index 73d2d83..e88406b 100644 --- a/mmdet3d/core/points/__init__.py +++ b/mmdet3d/core/points/__init__.py @@ -1,30 +1,30 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base_points import BasePoints -from .cam_points import CameraPoints -from .depth_points import DepthPoints -from .lidar_points import LiDARPoints - -__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints'] - - -def get_points_type(points_type): - """Get the class of points according to coordinate type. - - Args: - points_type (str): The type of points coordinate. - The valid value are "CAMERA", "LIDAR", or "DEPTH". - - Returns: - class: Points type. - """ - if points_type == 'CAMERA': - points_cls = CameraPoints - elif points_type == 'LIDAR': - points_cls = LiDARPoints - elif points_type == 'DEPTH': - points_cls = DepthPoints - else: - raise ValueError('Only "points_type" of "CAMERA", "LIDAR", or "DEPTH"' - f' are supported, got {points_type}') - - return points_cls +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints +from .cam_points import CameraPoints +from .depth_points import DepthPoints +from .lidar_points import LiDARPoints + +__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints'] + + +def get_points_type(points_type): + """Get the class of points according to coordinate type. + + Args: + points_type (str): The type of points coordinate. + The valid value are "CAMERA", "LIDAR", or "DEPTH". + + Returns: + class: Points type. + """ + if points_type == 'CAMERA': + points_cls = CameraPoints + elif points_type == 'LIDAR': + points_cls = LiDARPoints + elif points_type == 'DEPTH': + points_cls = DepthPoints + else: + raise ValueError('Only "points_type" of "CAMERA", "LIDAR", or "DEPTH"' + f' are supported, got {points_type}') + + return points_cls diff --git a/mmdet3d/core/points/base_points.py b/mmdet3d/core/points/base_points.py index 929fa21..ed2faf6 100644 --- a/mmdet3d/core/points/base_points.py +++ b/mmdet3d/core/points/base_points.py @@ -1,440 +1,440 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings -from abc import abstractmethod - -import numpy as np -import torch - -from ..bbox.structures.utils import rotation_3d_in_axis - - -class BasePoints(object): - """Base class for Points. - - Args: - tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. - points_dim (int, optional): Number of the dimension of a point. - Each row is (x, y, z). Defaults to 3. - attribute_dims (dict, optional): Dictionary to indicate the - meaning of extra dimension. Defaults to None. - - Attributes: - tensor (torch.Tensor): Float matrix of N x points_dim. - points_dim (int): Integer indicating the dimension of a point. - Each row is (x, y, z, ...). - attribute_dims (bool): Dictionary to indicate the meaning of extra - dimension. Defaults to None. 
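A short sketch of get_points_type from the points package above; the intensity attribute layout is an assumption for illustration:

import numpy as np
from mmdet3d.core.points import get_points_type

points_class = get_points_type('LIDAR')           # -> LiDARPoints
raw = np.random.rand(1000, 4).astype(np.float32)  # x, y, z + one extra column
points = points_class(raw, points_dim=4, attribute_dims=dict(intensity=3))
# 'CAMERA' and 'DEPTH' select CameraPoints / DepthPoints; anything else
# raises the ValueError shown above.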
- rotation_axis (int): Default rotation axis for points rotation. - """ - - def __init__(self, tensor, points_dim=3, attribute_dims=None): - if isinstance(tensor, torch.Tensor): - device = tensor.device - else: - device = torch.device('cpu') - tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) - if tensor.numel() == 0: - # Use reshape, so we don't end up creating a new tensor that - # does not depend on the inputs (and consequently confuses jit) - tensor = tensor.reshape((0, points_dim)).to( - dtype=torch.float32, device=device) - assert tensor.dim() == 2 and tensor.size(-1) == \ - points_dim, tensor.size() - - self.tensor = tensor - self.points_dim = points_dim - self.attribute_dims = attribute_dims - self.rotation_axis = 0 - - @property - def coord(self): - """torch.Tensor: Coordinates of each point in shape (N, 3).""" - return self.tensor[:, :3] - - @coord.setter - def coord(self, tensor): - """Set the coordinates of each point.""" - try: - tensor = tensor.reshape(self.shape[0], 3) - except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray - raise ValueError(f'got unexpected shape {tensor.shape}') - if not isinstance(tensor, torch.Tensor): - tensor = self.tensor.new_tensor(tensor) - self.tensor[:, :3] = tensor - - @property - def height(self): - """torch.Tensor: - A vector with height of each point in shape (N, 1), or None.""" - if self.attribute_dims is not None and \ - 'height' in self.attribute_dims.keys(): - return self.tensor[:, self.attribute_dims['height']] - else: - return None - - @height.setter - def height(self, tensor): - """Set the height of each point.""" - try: - tensor = tensor.reshape(self.shape[0]) - except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray - raise ValueError(f'got unexpected shape {tensor.shape}') - if not isinstance(tensor, torch.Tensor): - tensor = self.tensor.new_tensor(tensor) - if self.attribute_dims is not None and \ - 'height' in self.attribute_dims.keys(): - self.tensor[:, self.attribute_dims['height']] = tensor - else: - # add height attribute - if self.attribute_dims is None: - self.attribute_dims = dict() - attr_dim = self.shape[1] - self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1) - self.attribute_dims.update(dict(height=attr_dim)) - self.points_dim += 1 - - @property - def color(self): - """torch.Tensor: - A vector with color of each point in shape (N, 3), or None.""" - if self.attribute_dims is not None and \ - 'color' in self.attribute_dims.keys(): - return self.tensor[:, self.attribute_dims['color']] - else: - return None - - @color.setter - def color(self, tensor): - """Set the color of each point.""" - try: - tensor = tensor.reshape(self.shape[0], 3) - except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray - raise ValueError(f'got unexpected shape {tensor.shape}') - if tensor.max() >= 256 or tensor.min() < 0: - warnings.warn('point got color value beyond [0, 255]') - if not isinstance(tensor, torch.Tensor): - tensor = self.tensor.new_tensor(tensor) - if self.attribute_dims is not None and \ - 'color' in self.attribute_dims.keys(): - self.tensor[:, self.attribute_dims['color']] = tensor - else: - # add color attribute - if self.attribute_dims is None: - self.attribute_dims = dict() - attr_dim = self.shape[1] - self.tensor = torch.cat([self.tensor, tensor], dim=1) - self.attribute_dims.update( - dict(color=[attr_dim, attr_dim + 1, attr_dim + 2])) - self.points_dim += 3 - - @property - def shape(self): - """torch.Shape: Shape of points.""" - return 
self.tensor.shape - - def shuffle(self): - """Shuffle the points. - - Returns: - torch.Tensor: The shuffled index. - """ - idx = torch.randperm(self.__len__(), device=self.tensor.device) - self.tensor = self.tensor[idx] - return idx - - def rotate(self, rotation, axis=None): - """Rotate points with the given rotation matrix or angle. - - Args: - rotation (float | np.ndarray | torch.Tensor): Rotation matrix - or angle. - axis (int, optional): Axis to rotate at. Defaults to None. - """ - if not isinstance(rotation, torch.Tensor): - rotation = self.tensor.new_tensor(rotation) - assert rotation.shape == torch.Size([3, 3]) or \ - rotation.numel() == 1, f'invalid rotation shape {rotation.shape}' - - if axis is None: - axis = self.rotation_axis - - if rotation.numel() == 1: - rotated_points, rot_mat_T = rotation_3d_in_axis( - self.tensor[:, :3][None], rotation, axis=axis, return_mat=True) - self.tensor[:, :3] = rotated_points.squeeze(0) - rot_mat_T = rot_mat_T.squeeze(0) - else: - # rotation.numel() == 9 - self.tensor[:, :3] = self.tensor[:, :3] @ rotation - rot_mat_T = rotation - - return rot_mat_T - - @abstractmethod - def flip(self, bev_direction='horizontal'): - """Flip the points along given BEV direction. - - Args: - bev_direction (str): Flip direction (horizontal or vertical). - """ - pass - - def translate(self, trans_vector): - """Translate points with the given translation vector. - - Args: - trans_vector (np.ndarray, torch.Tensor): Translation - vector of size 3 or nx3. - """ - if not isinstance(trans_vector, torch.Tensor): - trans_vector = self.tensor.new_tensor(trans_vector) - trans_vector = trans_vector.squeeze(0) - if trans_vector.dim() == 1: - assert trans_vector.shape[0] == 3 - elif trans_vector.dim() == 2: - assert trans_vector.shape[0] == self.tensor.shape[0] and \ - trans_vector.shape[1] == 3 - else: - raise NotImplementedError( - f'Unsupported translation vector of shape {trans_vector.shape}' - ) - self.tensor[:, :3] += trans_vector - - def in_range_3d(self, point_range): - """Check whether the points are in the given range. - - Args: - point_range (list | torch.Tensor): The range of point - (x_min, y_min, z_min, x_max, y_max, z_max) - - Note: - In the original implementation of SECOND, checking whether - a box in the range checks whether the points are in a convex - polygon, we try to reduce the burden for simpler cases. - - Returns: - torch.Tensor: A binary vector indicating whether each point is - inside the reference range. - """ - in_range_flags = ((self.tensor[:, 0] > point_range[0]) - & (self.tensor[:, 1] > point_range[1]) - & (self.tensor[:, 2] > point_range[2]) - & (self.tensor[:, 0] < point_range[3]) - & (self.tensor[:, 1] < point_range[4]) - & (self.tensor[:, 2] < point_range[5])) - return in_range_flags - - @property - def bev(self): - """torch.Tensor: BEV of the points in shape (N, 2).""" - return self.tensor[:, [0, 1]] - - def in_range_bev(self, point_range): - """Check whether the points are in the given range. - - Args: - point_range (list | torch.Tensor): The range of point - in order of (x_min, y_min, x_max, y_max). - - Returns: - torch.Tensor: Indicating whether each point is inside - the reference range. - """ - in_range_flags = ((self.bev[:, 0] > point_range[0]) - & (self.bev[:, 1] > point_range[1]) - & (self.bev[:, 0] < point_range[2]) - & (self.bev[:, 1] < point_range[3])) - return in_range_flags - - @abstractmethod - def convert_to(self, dst, rt_mat=None): - """Convert self to ``dst`` mode. - - Args: - dst (:obj:`CoordMode`): The target Box mode. 
- rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - :obj:`BasePoints`: The converted box of the same type - in the `dst` mode. - """ - pass - - def scale(self, scale_factor): - """Scale the points with horizontal and vertical scaling factors. - - Args: - scale_factors (float): Scale factors to scale the points. - """ - self.tensor[:, :3] *= scale_factor - - def __getitem__(self, item): - """ - Note: - The following usage are allowed: - 1. `new_points = points[3]`: - return a `Points` that contains only one point. - 2. `new_points = points[2:10]`: - return a slice of points. - 3. `new_points = points[vector]`: - where vector is a torch.BoolTensor with `length = len(points)`. - Nonzero elements in the vector will be selected. - 4. `new_points = points[3:11, vector]`: - return a slice of points and attribute dims. - 5. `new_points = points[4:12, 2]`: - return a slice of points with single attribute. - Note that the returned Points might share storage with this Points, - subject to Pytorch's indexing semantics. - - Returns: - :obj:`BasePoints`: A new object of - :class:`BasePoints` after indexing. - """ - original_type = type(self) - if isinstance(item, int): - return original_type( - self.tensor[item].view(1, -1), - points_dim=self.points_dim, - attribute_dims=self.attribute_dims) - elif isinstance(item, tuple) and len(item) == 2: - if isinstance(item[1], slice): - start = 0 if item[1].start is None else item[1].start - stop = self.tensor.shape[1] if \ - item[1].stop is None else item[1].stop - step = 1 if item[1].step is None else item[1].step - item = list(item) - item[1] = list(range(start, stop, step)) - item = tuple(item) - elif isinstance(item[1], int): - item = list(item) - item[1] = [item[1]] - item = tuple(item) - p = self.tensor[item[0], item[1]] - - keep_dims = list( - set(item[1]).intersection(set(range(3, self.tensor.shape[1])))) - if self.attribute_dims is not None: - attribute_dims = self.attribute_dims.copy() - for key in self.attribute_dims.keys(): - cur_attribute_dims = attribute_dims[key] - if isinstance(cur_attribute_dims, int): - cur_attribute_dims = [cur_attribute_dims] - intersect_attr = list( - set(cur_attribute_dims).intersection(set(keep_dims))) - if len(intersect_attr) == 1: - attribute_dims[key] = intersect_attr[0] - elif len(intersect_attr) > 1: - attribute_dims[key] = intersect_attr - else: - attribute_dims.pop(key) - else: - attribute_dims = None - elif isinstance(item, (slice, np.ndarray, torch.Tensor)): - p = self.tensor[item] - attribute_dims = self.attribute_dims - else: - raise NotImplementedError(f'Invalid slice {item}!') - - assert p.dim() == 2, \ - f'Indexing on Points with {item} failed to return a matrix!' - return original_type( - p, points_dim=p.shape[1], attribute_dims=attribute_dims) - - def __len__(self): - """int: Number of points in the current object.""" - return self.tensor.shape[0] - - def __repr__(self): - """str: Return a strings that describes the object.""" - return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' - - @classmethod - def cat(cls, points_list): - """Concatenate a list of Points into a single Points. - - Args: - points_list (list[:obj:`BasePoints`]): List of points. - - Returns: - :obj:`BasePoints`: The concatenated Points. 
- """ - assert isinstance(points_list, (list, tuple)) - if len(points_list) == 0: - return cls(torch.empty(0)) - assert all(isinstance(points, cls) for points in points_list) - - # use torch.cat (v.s. layers.cat) - # so the returned points never share storage with input - cat_points = cls( - torch.cat([p.tensor for p in points_list], dim=0), - points_dim=points_list[0].tensor.shape[1], - attribute_dims=points_list[0].attribute_dims) - return cat_points - - def to(self, device): - """Convert current points to a specific device. - - Args: - device (str | :obj:`torch.device`): The name of the device. - - Returns: - :obj:`BasePoints`: A new boxes object on the - specific device. - """ - original_type = type(self) - return original_type( - self.tensor.to(device), - points_dim=self.points_dim, - attribute_dims=self.attribute_dims) - - def clone(self): - """Clone the Points. - - Returns: - :obj:`BasePoints`: Box object with the same properties - as self. - """ - original_type = type(self) - return original_type( - self.tensor.clone(), - points_dim=self.points_dim, - attribute_dims=self.attribute_dims) - - @property - def device(self): - """str: The device of the points are on.""" - return self.tensor.device - - def __iter__(self): - """Yield a point as a Tensor of shape (4,) at a time. - - Returns: - torch.Tensor: A point of shape (4,). - """ - yield from self.tensor - - def new_point(self, data): - """Create a new point object with data. - - The new point and its tensor has the similar properties - as self and self.tensor, respectively. - - Args: - data (torch.Tensor | numpy.array | list): Data to be copied. - - Returns: - :obj:`BasePoints`: A new point object with ``data``, - the object's other properties are similar to ``self``. - """ - new_tensor = self.tensor.new_tensor(data) \ - if not isinstance(data, torch.Tensor) else data.to(self.device) - original_type = type(self) - return original_type( - new_tensor, - points_dim=self.points_dim, - attribute_dims=self.attribute_dims) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from abc import abstractmethod + +import numpy as np +import torch + +from ..bbox.structures.utils import rotation_3d_in_axis + + +class BasePoints(object): + """Base class for Points. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int, optional): Number of the dimension of a point. + Each row is (x, y, z). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the + meaning of extra dimension. Defaults to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. 
+ """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, points_dim)).to( + dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == \ + points_dim, tensor.size() + + self.tensor = tensor + self.points_dim = points_dim + self.attribute_dims = attribute_dims + self.rotation_axis = 0 + + @property + def coord(self): + """torch.Tensor: Coordinates of each point in shape (N, 3).""" + return self.tensor[:, :3] + + @coord.setter + def coord(self, tensor): + """Set the coordinates of each point.""" + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + self.tensor[:, :3] = tensor + + @property + def height(self): + """torch.Tensor: + A vector with height of each point in shape (N, 1), or None.""" + if self.attribute_dims is not None and \ + 'height' in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims['height']] + else: + return None + + @height.setter + def height(self, tensor): + """Set the height of each point.""" + try: + tensor = tensor.reshape(self.shape[0]) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and \ + 'height' in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims['height']] = tensor + else: + # add height attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1) + self.attribute_dims.update(dict(height=attr_dim)) + self.points_dim += 1 + + @property + def color(self): + """torch.Tensor: + A vector with color of each point in shape (N, 3), or None.""" + if self.attribute_dims is not None and \ + 'color' in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims['color']] + else: + return None + + @color.setter + def color(self, tensor): + """Set the color of each point.""" + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if tensor.max() >= 256 or tensor.min() < 0: + warnings.warn('point got color value beyond [0, 255]') + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and \ + 'color' in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims['color']] = tensor + else: + # add color attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor], dim=1) + self.attribute_dims.update( + dict(color=[attr_dim, attr_dim + 1, attr_dim + 2])) + self.points_dim += 3 + + @property + def shape(self): + """torch.Shape: Shape of points.""" + return self.tensor.shape + + def shuffle(self): + """Shuffle the points. 
+ + Returns: + torch.Tensor: The shuffled index. + """ + idx = torch.randperm(self.__len__(), device=self.tensor.device) + self.tensor = self.tensor[idx] + return idx + + def rotate(self, rotation, axis=None): + """Rotate points with the given rotation matrix or angle. + + Args: + rotation (float | np.ndarray | torch.Tensor): Rotation matrix + or angle. + axis (int, optional): Axis to rotate at. Defaults to None. + """ + if not isinstance(rotation, torch.Tensor): + rotation = self.tensor.new_tensor(rotation) + assert rotation.shape == torch.Size([3, 3]) or \ + rotation.numel() == 1, f'invalid rotation shape {rotation.shape}' + + if axis is None: + axis = self.rotation_axis + + if rotation.numel() == 1: + rotated_points, rot_mat_T = rotation_3d_in_axis( + self.tensor[:, :3][None], rotation, axis=axis, return_mat=True) + self.tensor[:, :3] = rotated_points.squeeze(0) + rot_mat_T = rot_mat_T.squeeze(0) + else: + # rotation.numel() == 9 + self.tensor[:, :3] = self.tensor[:, :3] @ rotation + rot_mat_T = rotation + + return rot_mat_T + + @abstractmethod + def flip(self, bev_direction='horizontal'): + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + """ + pass + + def translate(self, trans_vector): + """Translate points with the given translation vector. + + Args: + trans_vector (np.ndarray, torch.Tensor): Translation + vector of size 3 or nx3. + """ + if not isinstance(trans_vector, torch.Tensor): + trans_vector = self.tensor.new_tensor(trans_vector) + trans_vector = trans_vector.squeeze(0) + if trans_vector.dim() == 1: + assert trans_vector.shape[0] == 3 + elif trans_vector.dim() == 2: + assert trans_vector.shape[0] == self.tensor.shape[0] and \ + trans_vector.shape[1] == 3 + else: + raise NotImplementedError( + f'Unsupported translation vector of shape {trans_vector.shape}' + ) + self.tensor[:, :3] += trans_vector + + def in_range_3d(self, point_range): + """Check whether the points are in the given range. + + Args: + point_range (list | torch.Tensor): The range of point + (x_min, y_min, z_min, x_max, y_max, z_max) + + Note: + In the original implementation of SECOND, checking whether + a box in the range checks whether the points are in a convex + polygon, we try to reduce the burden for simpler cases. + + Returns: + torch.Tensor: A binary vector indicating whether each point is + inside the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > point_range[0]) + & (self.tensor[:, 1] > point_range[1]) + & (self.tensor[:, 2] > point_range[2]) + & (self.tensor[:, 0] < point_range[3]) + & (self.tensor[:, 1] < point_range[4]) + & (self.tensor[:, 2] < point_range[5])) + return in_range_flags + + @property + def bev(self): + """torch.Tensor: BEV of the points in shape (N, 2).""" + return self.tensor[:, [0, 1]] + + def in_range_bev(self, point_range): + """Check whether the points are in the given range. + + Args: + point_range (list | torch.Tensor): The range of point + in order of (x_min, y_min, x_max, y_max). + + Returns: + torch.Tensor: Indicating whether each point is inside + the reference range. + """ + in_range_flags = ((self.bev[:, 0] > point_range[0]) + & (self.bev[:, 1] > point_range[1]) + & (self.bev[:, 0] < point_range[2]) + & (self.bev[:, 1] < point_range[3])) + return in_range_flags + + @abstractmethod + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`CoordMode`): The target Box mode. 
+ rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BasePoints`: The converted box of the same type + in the `dst` mode. + """ + pass + + def scale(self, scale_factor): + """Scale the points with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the points. + """ + self.tensor[:, :3] *= scale_factor + + def __getitem__(self, item): + """ + Note: + The following usage are allowed: + 1. `new_points = points[3]`: + return a `Points` that contains only one point. + 2. `new_points = points[2:10]`: + return a slice of points. + 3. `new_points = points[vector]`: + where vector is a torch.BoolTensor with `length = len(points)`. + Nonzero elements in the vector will be selected. + 4. `new_points = points[3:11, vector]`: + return a slice of points and attribute dims. + 5. `new_points = points[4:12, 2]`: + return a slice of points with single attribute. + Note that the returned Points might share storage with this Points, + subject to Pytorch's indexing semantics. + + Returns: + :obj:`BasePoints`: A new object of + :class:`BasePoints` after indexing. + """ + original_type = type(self) + if isinstance(item, int): + return original_type( + self.tensor[item].view(1, -1), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + elif isinstance(item, tuple) and len(item) == 2: + if isinstance(item[1], slice): + start = 0 if item[1].start is None else item[1].start + stop = self.tensor.shape[1] if \ + item[1].stop is None else item[1].stop + step = 1 if item[1].step is None else item[1].step + item = list(item) + item[1] = list(range(start, stop, step)) + item = tuple(item) + elif isinstance(item[1], int): + item = list(item) + item[1] = [item[1]] + item = tuple(item) + p = self.tensor[item[0], item[1]] + + keep_dims = list( + set(item[1]).intersection(set(range(3, self.tensor.shape[1])))) + if self.attribute_dims is not None: + attribute_dims = self.attribute_dims.copy() + for key in self.attribute_dims.keys(): + cur_attribute_dims = attribute_dims[key] + if isinstance(cur_attribute_dims, int): + cur_attribute_dims = [cur_attribute_dims] + intersect_attr = list( + set(cur_attribute_dims).intersection(set(keep_dims))) + if len(intersect_attr) == 1: + attribute_dims[key] = intersect_attr[0] + elif len(intersect_attr) > 1: + attribute_dims[key] = intersect_attr + else: + attribute_dims.pop(key) + else: + attribute_dims = None + elif isinstance(item, (slice, np.ndarray, torch.Tensor)): + p = self.tensor[item] + attribute_dims = self.attribute_dims + else: + raise NotImplementedError(f'Invalid slice {item}!') + + assert p.dim() == 2, \ + f'Indexing on Points with {item} failed to return a matrix!' + return original_type( + p, points_dim=p.shape[1], attribute_dims=attribute_dims) + + def __len__(self): + """int: Number of points in the current object.""" + return self.tensor.shape[0] + + def __repr__(self): + """str: Return a strings that describes the object.""" + return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' + + @classmethod + def cat(cls, points_list): + """Concatenate a list of Points into a single Points. + + Args: + points_list (list[:obj:`BasePoints`]): List of points. + + Returns: + :obj:`BasePoints`: The concatenated Points. 
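The indexing semantics of __getitem__ above can be seen in a small sketch; LiDARPoints is used as the concrete subclass and the attribute names are assumptions:

import torch
from mmdet3d.core.points import LiDARPoints

pts = LiDARPoints(
    torch.rand(8, 6),
    points_dim=6,
    attribute_dims=dict(intensity=3, timestamp=4, ring=5))

sub = pts[:, 0:4]           # keep xyz + intensity
# sub.points_dim == 4 and sub.attribute_dims == {'intensity': 3};
# 'timestamp' and 'ring' are dropped because their columns were not kept.
mask = pts.tensor[:, 3] > 0.5
bright = pts[mask]          # boolean-mask indexing keeps all attribute dims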
+ """ + assert isinstance(points_list, (list, tuple)) + if len(points_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(points, cls) for points in points_list) + + # use torch.cat (v.s. layers.cat) + # so the returned points never share storage with input + cat_points = cls( + torch.cat([p.tensor for p in points_list], dim=0), + points_dim=points_list[0].tensor.shape[1], + attribute_dims=points_list[0].attribute_dims) + return cat_points + + def to(self, device): + """Convert current points to a specific device. + + Args: + device (str | :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BasePoints`: A new boxes object on the + specific device. + """ + original_type = type(self) + return original_type( + self.tensor.to(device), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def clone(self): + """Clone the Points. + + Returns: + :obj:`BasePoints`: Box object with the same properties + as self. + """ + original_type = type(self) + return original_type( + self.tensor.clone(), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + @property + def device(self): + """str: The device of the points are on.""" + return self.tensor.device + + def __iter__(self): + """Yield a point as a Tensor of shape (4,) at a time. + + Returns: + torch.Tensor: A point of shape (4,). + """ + yield from self.tensor + + def new_point(self, data): + """Create a new point object with data. + + The new point and its tensor has the similar properties + as self and self.tensor, respectively. + + Args: + data (torch.Tensor | numpy.array | list): Data to be copied. + + Returns: + :obj:`BasePoints`: A new point object with ``data``, + the object's other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) \ + if not isinstance(data, torch.Tensor) else data.to(self.device) + original_type = type(self) + return original_type( + new_tensor, + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) diff --git a/mmdet3d/core/points/cam_points.py b/mmdet3d/core/points/cam_points.py index a57c3db..a7cfc0a 100644 --- a/mmdet3d/core/points/cam_points.py +++ b/mmdet3d/core/points/cam_points.py @@ -1,63 +1,63 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base_points import BasePoints - - -class CameraPoints(BasePoints): - """Points of instances in CAM coordinates. - - Args: - tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. - points_dim (int, optional): Number of the dimension of a point. - Each row is (x, y, z). Defaults to 3. - attribute_dims (dict, optional): Dictionary to indicate the - meaning of extra dimension. Defaults to None. - - Attributes: - tensor (torch.Tensor): Float matrix of N x points_dim. - points_dim (int): Integer indicating the dimension of a point. - Each row is (x, y, z, ...). - attribute_dims (bool): Dictionary to indicate the meaning of extra - dimension. Defaults to None. - rotation_axis (int): Default rotation axis for points rotation. - """ - - def __init__(self, tensor, points_dim=3, attribute_dims=None): - super(CameraPoints, self).__init__( - tensor, points_dim=points_dim, attribute_dims=attribute_dims) - self.rotation_axis = 1 - - def flip(self, bev_direction='horizontal'): - """Flip the points along given BEV direction. - - Args: - bev_direction (str): Flip direction (horizontal or vertical). 
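A brief sketch of clone/cat/to from above, again using LiDARPoints; the CUDA device is an assumption and can be dropped on CPU-only machines:

import torch
from mmdet3d.core.points import LiDARPoints

a = LiDARPoints(torch.rand(4, 4), points_dim=4,
                attribute_dims=dict(intensity=3))
b = a.clone()
b.translate([1.0, 0.0, 0.0])        # shifts b only; a is untouched
merged = LiDARPoints.cat([a, b])    # 8 points, attribute_dims carried over
# merged_gpu = merged.to('cuda:0')  # only if a GPU is available (assumed)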
- """ - if bev_direction == 'horizontal': - self.tensor[:, 0] = -self.tensor[:, 0] - elif bev_direction == 'vertical': - self.tensor[:, 2] = -self.tensor[:, 2] - - @property - def bev(self): - """torch.Tensor: BEV of the points in shape (N, 2).""" - return self.tensor[:, [0, 2]] - - def convert_to(self, dst, rt_mat=None): - """Convert self to ``dst`` mode. - - Args: - dst (:obj:`CoordMode`): The target Point mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - :obj:`BasePoints`: The converted point of the same type - in the `dst` mode. - """ - from mmdet3d.core.bbox import Coord3DMode - return Coord3DMode.convert_point( - point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat) +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints + + +class CameraPoints(BasePoints): + """Points of instances in CAM coordinates. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int, optional): Number of the dimension of a point. + Each row is (x, y, z). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the + meaning of extra dimension. Defaults to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + super(CameraPoints, self).__init__( + tensor, points_dim=points_dim, attribute_dims=attribute_dims) + self.rotation_axis = 1 + + def flip(self, bev_direction='horizontal'): + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + """ + if bev_direction == 'horizontal': + self.tensor[:, 0] = -self.tensor[:, 0] + elif bev_direction == 'vertical': + self.tensor[:, 2] = -self.tensor[:, 2] + + @property + def bev(self): + """torch.Tensor: BEV of the points in shape (N, 2).""" + return self.tensor[:, [0, 2]] + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type + in the `dst` mode. + """ + from mmdet3d.core.bbox import Coord3DMode + return Coord3DMode.convert_point( + point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat) diff --git a/mmdet3d/core/points/depth_points.py b/mmdet3d/core/points/depth_points.py index 2d9221f..c0e1547 100644 --- a/mmdet3d/core/points/depth_points.py +++ b/mmdet3d/core/points/depth_points.py @@ -1,58 +1,58 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base_points import BasePoints - - -class DepthPoints(BasePoints): - """Points of instances in DEPTH coordinates. 
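The camera-frame conventions above (rotation about axis 1 by default, BEV taken on the x-z plane) can be checked with a tiny sketch, assuming the surrounding mmdet3d package is importable:

import numpy as np
import torch
from mmdet3d.core.points import CameraPoints

cam = CameraPoints(torch.tensor([[1.0, 2.0, 3.0]]))
cam.flip('horizontal')     # negates x -> [-1., 2., 3.]
print(cam.bev)             # tensor([[-1., 3.]]): the (x, z) plane
cam.rotate(np.pi / 2)      # scalar angle rotates about axis 1 (y) by default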
- - Args: - tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. - points_dim (int, optional): Number of the dimension of a point. - Each row is (x, y, z). Defaults to 3. - attribute_dims (dict, optional): Dictionary to indicate the - meaning of extra dimension. Defaults to None. - - Attributes: - tensor (torch.Tensor): Float matrix of N x points_dim. - points_dim (int): Integer indicating the dimension of a point. - Each row is (x, y, z, ...). - attribute_dims (bool): Dictionary to indicate the meaning of extra - dimension. Defaults to None. - rotation_axis (int): Default rotation axis for points rotation. - """ - - def __init__(self, tensor, points_dim=3, attribute_dims=None): - super(DepthPoints, self).__init__( - tensor, points_dim=points_dim, attribute_dims=attribute_dims) - self.rotation_axis = 2 - - def flip(self, bev_direction='horizontal'): - """Flip the points along given BEV direction. - - Args: - bev_direction (str): Flip direction (horizontal or vertical). - """ - if bev_direction == 'horizontal': - self.tensor[:, 0] = -self.tensor[:, 0] - elif bev_direction == 'vertical': - self.tensor[:, 1] = -self.tensor[:, 1] - - def convert_to(self, dst, rt_mat=None): - """Convert self to ``dst`` mode. - - Args: - dst (:obj:`CoordMode`): The target Point mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - :obj:`BasePoints`: The converted point of the same type - in the `dst` mode. - """ - from mmdet3d.core.bbox import Coord3DMode - return Coord3DMode.convert_point( - point=self, src=Coord3DMode.DEPTH, dst=dst, rt_mat=rt_mat) +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints + + +class DepthPoints(BasePoints): + """Points of instances in DEPTH coordinates. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int, optional): Number of the dimension of a point. + Each row is (x, y, z). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the + meaning of extra dimension. Defaults to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + super(DepthPoints, self).__init__( + tensor, points_dim=points_dim, attribute_dims=attribute_dims) + self.rotation_axis = 2 + + def flip(self, bev_direction='horizontal'): + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + """ + if bev_direction == 'horizontal': + self.tensor[:, 0] = -self.tensor[:, 0] + elif bev_direction == 'vertical': + self.tensor[:, 1] = -self.tensor[:, 1] + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. 
+ The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type + in the `dst` mode. + """ + from mmdet3d.core.bbox import Coord3DMode + return Coord3DMode.convert_point( + point=self, src=Coord3DMode.DEPTH, dst=dst, rt_mat=rt_mat) diff --git a/mmdet3d/core/points/lidar_points.py b/mmdet3d/core/points/lidar_points.py index ff4f57a..4edf26a 100644 --- a/mmdet3d/core/points/lidar_points.py +++ b/mmdet3d/core/points/lidar_points.py @@ -1,58 +1,58 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base_points import BasePoints - - -class LiDARPoints(BasePoints): - """Points of instances in LIDAR coordinates. - - Args: - tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. - points_dim (int, optional): Number of the dimension of a point. - Each row is (x, y, z). Defaults to 3. - attribute_dims (dict, optional): Dictionary to indicate the - meaning of extra dimension. Defaults to None. - - Attributes: - tensor (torch.Tensor): Float matrix of N x points_dim. - points_dim (int): Integer indicating the dimension of a point. - Each row is (x, y, z, ...). - attribute_dims (bool): Dictionary to indicate the meaning of extra - dimension. Defaults to None. - rotation_axis (int): Default rotation axis for points rotation. - """ - - def __init__(self, tensor, points_dim=3, attribute_dims=None): - super(LiDARPoints, self).__init__( - tensor, points_dim=points_dim, attribute_dims=attribute_dims) - self.rotation_axis = 2 - - def flip(self, bev_direction='horizontal'): - """Flip the points along given BEV direction. - - Args: - bev_direction (str): Flip direction (horizontal or vertical). - """ - if bev_direction == 'horizontal': - self.tensor[:, 1] = -self.tensor[:, 1] - elif bev_direction == 'vertical': - self.tensor[:, 0] = -self.tensor[:, 0] - - def convert_to(self, dst, rt_mat=None): - """Convert self to ``dst`` mode. - - Args: - dst (:obj:`CoordMode`): The target Point mode. - rt_mat (np.ndarray | torch.Tensor, optional): The rotation and - translation matrix between different coordinates. - Defaults to None. - The conversion from `src` coordinates to `dst` coordinates - usually comes along the change of sensors, e.g., from camera - to LiDAR. This requires a transformation matrix. - - Returns: - :obj:`BasePoints`: The converted point of the same type - in the `dst` mode. - """ - from mmdet3d.core.bbox import Coord3DMode - return Coord3DMode.convert_point( - point=self, src=Coord3DMode.LIDAR, dst=dst, rt_mat=rt_mat) +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints + + +class LiDARPoints(BasePoints): + """Points of instances in LIDAR coordinates. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int, optional): Number of the dimension of a point. + Each row is (x, y, z). Defaults to 3. + attribute_dims (dict, optional): Dictionary to indicate the + meaning of extra dimension. Defaults to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Defaults to None. + rotation_axis (int): Default rotation axis for points rotation. 
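A sketch of convert_to for the point classes above; whether the default rt_mat matches a given sensor setup is an assumption, so pass an explicit matrix when in doubt:

import torch
from mmdet3d.core.bbox import Coord3DMode
from mmdet3d.core.points import DepthPoints

depth_pts = DepthPoints(torch.rand(5, 3))
lidar_pts = depth_pts.convert_to(Coord3DMode.LIDAR)
# dispatches to Coord3DMode.convert_point(point=..., src=DEPTH, dst=LIDAR);
# supply rt_mat=... explicitly if the default transform is not appropriate.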
+ """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + super(LiDARPoints, self).__init__( + tensor, points_dim=points_dim, attribute_dims=attribute_dims) + self.rotation_axis = 2 + + def flip(self, bev_direction='horizontal'): + """Flip the points along given BEV direction. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + """ + if bev_direction == 'horizontal': + self.tensor[:, 1] = -self.tensor[:, 1] + elif bev_direction == 'vertical': + self.tensor[:, 0] = -self.tensor[:, 0] + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor, optional): The rotation and + translation matrix between different coordinates. + Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type + in the `dst` mode. + """ + from mmdet3d.core.bbox import Coord3DMode + return Coord3DMode.convert_point( + point=self, src=Coord3DMode.LIDAR, dst=dst, rt_mat=rt_mat) diff --git a/mmdet3d/core/post_processing/__init__.py b/mmdet3d/core/post_processing/__init__.py index 2fb534e..42eb5bf 100644 --- a/mmdet3d/core/post_processing/__init__.py +++ b/mmdet3d/core/post_processing/__init__.py @@ -1,14 +1,14 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.core.post_processing import (merge_aug_bboxes, merge_aug_masks, - merge_aug_proposals, merge_aug_scores, - multiclass_nms) -from .box3d_nms import (aligned_3d_nms, box3d_multiclass_nms, circle_nms, - nms_bev, nms_normal_bev) -from .merge_augs import merge_aug_bboxes_3d - -__all__ = [ - 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', - 'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms', - 'aligned_3d_nms', 'merge_aug_bboxes_3d', 'circle_nms', 'nms_bev', - 'nms_normal_bev' -] +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.core.post_processing import (merge_aug_bboxes, merge_aug_masks, + merge_aug_proposals, merge_aug_scores, + multiclass_nms) +from .box3d_nms import (aligned_3d_nms, box3d_multiclass_nms, circle_nms, + nms_bev, nms_normal_bev) +from .merge_augs import merge_aug_bboxes_3d + +__all__ = [ + 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', + 'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms', + 'aligned_3d_nms', 'merge_aug_bboxes_3d', 'circle_nms', 'nms_bev', + 'nms_normal_bev' +] diff --git a/mmdet3d/core/post_processing/box3d_nms.py b/mmdet3d/core/post_processing/box3d_nms.py index 2d42085..17850f4 100644 --- a/mmdet3d/core/post_processing/box3d_nms.py +++ b/mmdet3d/core/post_processing/box3d_nms.py @@ -1,288 +1,288 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numba -import numpy as np -import torch -from mmcv.ops import nms, nms_rotated - - -def box3d_multiclass_nms(mlvl_bboxes, - mlvl_bboxes_for_nms, - mlvl_scores, - score_thr, - max_num, - cfg, - mlvl_dir_scores=None, - mlvl_attr_scores=None, - mlvl_bboxes2d=None): - """Multi-class NMS for 3D boxes. The IoU used for NMS is defined as the 2D - IoU between BEV boxes. - - Args: - mlvl_bboxes (torch.Tensor): Multi-level boxes with shape (N, M). - M is the dimensions of boxes. - mlvl_bboxes_for_nms (torch.Tensor): Multi-level boxes with shape - (N, 5) ([x1, y1, x2, y2, ry]). N is the number of boxes. - The coordinate system of the BEV boxes is counterclockwise. 
- mlvl_scores (torch.Tensor): Multi-level boxes with shape - (N, C + 1). N is the number of boxes. C is the number of classes. - score_thr (float): Score threshold to filter boxes with low - confidence. - max_num (int): Maximum number of boxes will be kept. - cfg (dict): Configuration dict of NMS. - mlvl_dir_scores (torch.Tensor, optional): Multi-level scores - of direction classifier. Defaults to None. - mlvl_attr_scores (torch.Tensor, optional): Multi-level scores - of attribute classifier. Defaults to None. - mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding - boxes. Defaults to None. - - Returns: - tuple[torch.Tensor]: Return results after nms, including 3D - bounding boxes, scores, labels, direction scores, attribute - scores (optional) and 2D bounding boxes (optional). - """ - # do multi class nms - # the fg class id range: [0, num_classes-1] - num_classes = mlvl_scores.shape[1] - 1 - bboxes = [] - scores = [] - labels = [] - dir_scores = [] - attr_scores = [] - bboxes2d = [] - for i in range(0, num_classes): - # get bboxes and scores of this class - cls_inds = mlvl_scores[:, i] > score_thr - if not cls_inds.any(): - continue - - _scores = mlvl_scores[cls_inds, i] - _bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :] - - if cfg.use_rotate_nms: - nms_func = nms_bev - else: - nms_func = nms_normal_bev - - selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr) - _mlvl_bboxes = mlvl_bboxes[cls_inds, :] - bboxes.append(_mlvl_bboxes[selected]) - scores.append(_scores[selected]) - cls_label = mlvl_bboxes.new_full((len(selected), ), - i, - dtype=torch.long) - labels.append(cls_label) - - if mlvl_dir_scores is not None: - _mlvl_dir_scores = mlvl_dir_scores[cls_inds] - dir_scores.append(_mlvl_dir_scores[selected]) - if mlvl_attr_scores is not None: - _mlvl_attr_scores = mlvl_attr_scores[cls_inds] - attr_scores.append(_mlvl_attr_scores[selected]) - if mlvl_bboxes2d is not None: - _mlvl_bboxes2d = mlvl_bboxes2d[cls_inds] - bboxes2d.append(_mlvl_bboxes2d[selected]) - - if bboxes: - bboxes = torch.cat(bboxes, dim=0) - scores = torch.cat(scores, dim=0) - labels = torch.cat(labels, dim=0) - if mlvl_dir_scores is not None: - dir_scores = torch.cat(dir_scores, dim=0) - if mlvl_attr_scores is not None: - attr_scores = torch.cat(attr_scores, dim=0) - if mlvl_bboxes2d is not None: - bboxes2d = torch.cat(bboxes2d, dim=0) - if bboxes.shape[0] > max_num: - _, inds = scores.sort(descending=True) - inds = inds[:max_num] - bboxes = bboxes[inds, :] - labels = labels[inds] - scores = scores[inds] - if mlvl_dir_scores is not None: - dir_scores = dir_scores[inds] - if mlvl_attr_scores is not None: - attr_scores = attr_scores[inds] - if mlvl_bboxes2d is not None: - bboxes2d = bboxes2d[inds] - else: - bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1))) - scores = mlvl_scores.new_zeros((0, )) - labels = mlvl_scores.new_zeros((0, ), dtype=torch.long) - if mlvl_dir_scores is not None: - dir_scores = mlvl_scores.new_zeros((0, )) - if mlvl_attr_scores is not None: - attr_scores = mlvl_scores.new_zeros((0, )) - if mlvl_bboxes2d is not None: - bboxes2d = mlvl_scores.new_zeros((0, 4)) - - results = (bboxes, scores, labels) - - if mlvl_dir_scores is not None: - results = results + (dir_scores, ) - if mlvl_attr_scores is not None: - results = results + (attr_scores, ) - if mlvl_bboxes2d is not None: - results = results + (bboxes2d, ) - - return results - - -def aligned_3d_nms(boxes, scores, classes, thresh): - """3D NMS for aligned boxes. - - Args: - boxes (torch.Tensor): Aligned box with shape [n, 6]. 
- scores (torch.Tensor): Scores of each box. - classes (torch.Tensor): Class of each box. - thresh (float): IoU threshold for nms. - - Returns: - torch.Tensor: Indices of selected boxes. - """ - x1 = boxes[:, 0] - y1 = boxes[:, 1] - z1 = boxes[:, 2] - x2 = boxes[:, 3] - y2 = boxes[:, 4] - z2 = boxes[:, 5] - area = (x2 - x1) * (y2 - y1) * (z2 - z1) - zero = boxes.new_zeros(1, ) - - score_sorted = torch.argsort(scores) - pick = [] - while (score_sorted.shape[0] != 0): - last = score_sorted.shape[0] - i = score_sorted[-1] - pick.append(i) - - xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]]) - yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]]) - zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]]) - xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]]) - yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]]) - zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]]) - classes1 = classes[i] - classes2 = classes[score_sorted[:last - 1]] - inter_l = torch.max(zero, xx2 - xx1) - inter_w = torch.max(zero, yy2 - yy1) - inter_h = torch.max(zero, zz2 - zz1) - - inter = inter_l * inter_w * inter_h - iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter) - iou = iou * (classes1 == classes2).float() - score_sorted = score_sorted[torch.nonzero( - iou <= thresh, as_tuple=False).flatten()] - - indices = boxes.new_tensor(pick, dtype=torch.long) - return indices - - -@numba.jit(nopython=True) -def circle_nms(dets, thresh, post_max_size=83): - """Circular NMS. - - An object is only counted as positive if no other center - with a higher confidence exists within a radius r using a - bird-eye view distance metric. - - Args: - dets (torch.Tensor): Detection results with the shape of [N, 3]. - thresh (float): Value of threshold. - post_max_size (int, optional): Max number of prediction to be kept. - Defaults to 83. - - Returns: - torch.Tensor: Indexes of the detections to be kept. - """ - x1 = dets[:, 0] - y1 = dets[:, 1] - scores = dets[:, 2] - order = scores.argsort()[::-1].astype(np.int32) # highest->lowest - ndets = dets.shape[0] - suppressed = np.zeros((ndets), dtype=np.int32) - keep = [] - for _i in range(ndets): - i = order[_i] # start with highest score box - if suppressed[ - i] == 1: # if any box have enough iou with this, remove it - continue - keep.append(i) - for _j in range(_i + 1, ndets): - j = order[_j] - if suppressed[j] == 1: - continue - # calculate center distance between i and j box - dist = (x1[i] - x1[j])**2 + (y1[i] - y1[j])**2 - - # ovr = inter / areas[j] - if dist <= thresh: - suppressed[j] = 1 - - if post_max_size < len(keep): - return keep[:post_max_size] - - return keep - - -# This function duplicates functionality of mmcv.ops.iou_3d.nms_bev -# from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms_rotated. -# Nms api will be unified in mmdetection3d one day. -def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None): - """NMS function GPU implementation (for BEV boxes). The overlap of two - boxes for IoU calculation is defined as the exact overlapping area of the - two boxes. In this function, one can also set ``pre_max_size`` and - ``post_max_size``. - - Args: - boxes (torch.Tensor): Input boxes with the shape of [N, 5] - ([x1, y1, x2, y2, ry]). - scores (torch.Tensor): Scores of boxes with the shape of [N]. - thresh (float): Overlap threshold of NMS. - pre_max_size (int, optional): Max size of boxes before NMS. - Default: None. - post_max_size (int, optional): Max size of boxes after NMS. - Default: None. - - Returns: - torch.Tensor: Indexes after NMS. 
- """ - assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]' - order = scores.sort(0, descending=True)[1] - if pre_max_size is not None: - order = order[:pre_max_size] - boxes = boxes[order].contiguous() - scores = scores[order] - - # xyxyr -> back to xywhr - # note: better skip this step before nms_bev call in the future - boxes = torch.stack( - ((boxes[:, 0] + boxes[:, 2]) / 2, (boxes[:, 1] + boxes[:, 3]) / 2, - boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1], boxes[:, 4]), - dim=-1) - - keep = nms_rotated(boxes, scores, thresh)[1] - keep = order[keep] - if post_max_size is not None: - keep = keep[:post_max_size] - return keep - - -# This function duplicates functionality of mmcv.ops.iou_3d.nms_normal_bev -# from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms. -# Nms api will be unified in mmdetection3d one day. -def nms_normal_bev(boxes, scores, thresh): - """Normal NMS function GPU implementation (for BEV boxes). The overlap of - two boxes for IoU calculation is defined as the exact overlapping area of - the two boxes WITH their yaw angle set to 0. - - Args: - boxes (torch.Tensor): Input boxes with shape (N, 5). - scores (torch.Tensor): Scores of predicted boxes with shape (N). - thresh (float): Overlap threshold of NMS. - - Returns: - torch.Tensor: Remaining indices with scores in descending order. - """ - assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]' - return nms(boxes[:, :-1], scores, thresh)[1] +# Copyright (c) OpenMMLab. All rights reserved. +import numba +import numpy as np +import torch +from mmcv.ops import nms, nms_rotated + + +def box3d_multiclass_nms(mlvl_bboxes, + mlvl_bboxes_for_nms, + mlvl_scores, + score_thr, + max_num, + cfg, + mlvl_dir_scores=None, + mlvl_attr_scores=None, + mlvl_bboxes2d=None): + """Multi-class NMS for 3D boxes. The IoU used for NMS is defined as the 2D + IoU between BEV boxes. + + Args: + mlvl_bboxes (torch.Tensor): Multi-level boxes with shape (N, M). + M is the dimensions of boxes. + mlvl_bboxes_for_nms (torch.Tensor): Multi-level boxes with shape + (N, 5) ([x1, y1, x2, y2, ry]). N is the number of boxes. + The coordinate system of the BEV boxes is counterclockwise. + mlvl_scores (torch.Tensor): Multi-level boxes with shape + (N, C + 1). N is the number of boxes. C is the number of classes. + score_thr (float): Score threshold to filter boxes with low + confidence. + max_num (int): Maximum number of boxes will be kept. + cfg (dict): Configuration dict of NMS. + mlvl_dir_scores (torch.Tensor, optional): Multi-level scores + of direction classifier. Defaults to None. + mlvl_attr_scores (torch.Tensor, optional): Multi-level scores + of attribute classifier. Defaults to None. + mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding + boxes. Defaults to None. + + Returns: + tuple[torch.Tensor]: Return results after nms, including 3D + bounding boxes, scores, labels, direction scores, attribute + scores (optional) and 2D bounding boxes (optional). 
+ """ + # do multi class nms + # the fg class id range: [0, num_classes-1] + num_classes = mlvl_scores.shape[1] - 1 + bboxes = [] + scores = [] + labels = [] + dir_scores = [] + attr_scores = [] + bboxes2d = [] + for i in range(0, num_classes): + # get bboxes and scores of this class + cls_inds = mlvl_scores[:, i] > score_thr + if not cls_inds.any(): + continue + + _scores = mlvl_scores[cls_inds, i] + _bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :] + + if cfg.use_rotate_nms: + nms_func = nms_bev + else: + nms_func = nms_normal_bev + + selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr) + _mlvl_bboxes = mlvl_bboxes[cls_inds, :] + bboxes.append(_mlvl_bboxes[selected]) + scores.append(_scores[selected]) + cls_label = mlvl_bboxes.new_full((len(selected), ), + i, + dtype=torch.long) + labels.append(cls_label) + + if mlvl_dir_scores is not None: + _mlvl_dir_scores = mlvl_dir_scores[cls_inds] + dir_scores.append(_mlvl_dir_scores[selected]) + if mlvl_attr_scores is not None: + _mlvl_attr_scores = mlvl_attr_scores[cls_inds] + attr_scores.append(_mlvl_attr_scores[selected]) + if mlvl_bboxes2d is not None: + _mlvl_bboxes2d = mlvl_bboxes2d[cls_inds] + bboxes2d.append(_mlvl_bboxes2d[selected]) + + if bboxes: + bboxes = torch.cat(bboxes, dim=0) + scores = torch.cat(scores, dim=0) + labels = torch.cat(labels, dim=0) + if mlvl_dir_scores is not None: + dir_scores = torch.cat(dir_scores, dim=0) + if mlvl_attr_scores is not None: + attr_scores = torch.cat(attr_scores, dim=0) + if mlvl_bboxes2d is not None: + bboxes2d = torch.cat(bboxes2d, dim=0) + if bboxes.shape[0] > max_num: + _, inds = scores.sort(descending=True) + inds = inds[:max_num] + bboxes = bboxes[inds, :] + labels = labels[inds] + scores = scores[inds] + if mlvl_dir_scores is not None: + dir_scores = dir_scores[inds] + if mlvl_attr_scores is not None: + attr_scores = attr_scores[inds] + if mlvl_bboxes2d is not None: + bboxes2d = bboxes2d[inds] + else: + bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1))) + scores = mlvl_scores.new_zeros((0, )) + labels = mlvl_scores.new_zeros((0, ), dtype=torch.long) + if mlvl_dir_scores is not None: + dir_scores = mlvl_scores.new_zeros((0, )) + if mlvl_attr_scores is not None: + attr_scores = mlvl_scores.new_zeros((0, )) + if mlvl_bboxes2d is not None: + bboxes2d = mlvl_scores.new_zeros((0, 4)) + + results = (bboxes, scores, labels) + + if mlvl_dir_scores is not None: + results = results + (dir_scores, ) + if mlvl_attr_scores is not None: + results = results + (attr_scores, ) + if mlvl_bboxes2d is not None: + results = results + (bboxes2d, ) + + return results + + +def aligned_3d_nms(boxes, scores, classes, thresh): + """3D NMS for aligned boxes. + + Args: + boxes (torch.Tensor): Aligned box with shape [n, 6]. + scores (torch.Tensor): Scores of each box. + classes (torch.Tensor): Class of each box. + thresh (float): IoU threshold for nms. + + Returns: + torch.Tensor: Indices of selected boxes. 
+ """ + x1 = boxes[:, 0] + y1 = boxes[:, 1] + z1 = boxes[:, 2] + x2 = boxes[:, 3] + y2 = boxes[:, 4] + z2 = boxes[:, 5] + area = (x2 - x1) * (y2 - y1) * (z2 - z1) + zero = boxes.new_zeros(1, ) + + score_sorted = torch.argsort(scores) + pick = [] + while (score_sorted.shape[0] != 0): + last = score_sorted.shape[0] + i = score_sorted[-1] + pick.append(i) + + xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]]) + yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]]) + zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]]) + xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]]) + yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]]) + zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]]) + classes1 = classes[i] + classes2 = classes[score_sorted[:last - 1]] + inter_l = torch.max(zero, xx2 - xx1) + inter_w = torch.max(zero, yy2 - yy1) + inter_h = torch.max(zero, zz2 - zz1) + + inter = inter_l * inter_w * inter_h + iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter) + iou = iou * (classes1 == classes2).float() + score_sorted = score_sorted[torch.nonzero( + iou <= thresh, as_tuple=False).flatten()] + + indices = boxes.new_tensor(pick, dtype=torch.long) + return indices + + +@numba.jit(nopython=True) +def circle_nms(dets, thresh, post_max_size=83): + """Circular NMS. + + An object is only counted as positive if no other center + with a higher confidence exists within a radius r using a + bird-eye view distance metric. + + Args: + dets (torch.Tensor): Detection results with the shape of [N, 3]. + thresh (float): Value of threshold. + post_max_size (int, optional): Max number of prediction to be kept. + Defaults to 83. + + Returns: + torch.Tensor: Indexes of the detections to be kept. + """ + x1 = dets[:, 0] + y1 = dets[:, 1] + scores = dets[:, 2] + order = scores.argsort()[::-1].astype(np.int32) # highest->lowest + ndets = dets.shape[0] + suppressed = np.zeros((ndets), dtype=np.int32) + keep = [] + for _i in range(ndets): + i = order[_i] # start with highest score box + if suppressed[ + i] == 1: # if any box have enough iou with this, remove it + continue + keep.append(i) + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + # calculate center distance between i and j box + dist = (x1[i] - x1[j])**2 + (y1[i] - y1[j])**2 + + # ovr = inter / areas[j] + if dist <= thresh: + suppressed[j] = 1 + + if post_max_size < len(keep): + return keep[:post_max_size] + + return keep + + +# This function duplicates functionality of mmcv.ops.iou_3d.nms_bev +# from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms_rotated. +# Nms api will be unified in mmdetection3d one day. +def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None): + """NMS function GPU implementation (for BEV boxes). The overlap of two + boxes for IoU calculation is defined as the exact overlapping area of the + two boxes. In this function, one can also set ``pre_max_size`` and + ``post_max_size``. + + Args: + boxes (torch.Tensor): Input boxes with the shape of [N, 5] + ([x1, y1, x2, y2, ry]). + scores (torch.Tensor): Scores of boxes with the shape of [N]. + thresh (float): Overlap threshold of NMS. + pre_max_size (int, optional): Max size of boxes before NMS. + Default: None. + post_max_size (int, optional): Max size of boxes after NMS. + Default: None. + + Returns: + torch.Tensor: Indexes after NMS. 
+ """ + assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]' + order = scores.sort(0, descending=True)[1] + if pre_max_size is not None: + order = order[:pre_max_size] + boxes = boxes[order].contiguous() + scores = scores[order] + + # xyxyr -> back to xywhr + # note: better skip this step before nms_bev call in the future + boxes = torch.stack( + ((boxes[:, 0] + boxes[:, 2]) / 2, (boxes[:, 1] + boxes[:, 3]) / 2, + boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1], boxes[:, 4]), + dim=-1) + + keep = nms_rotated(boxes, scores, thresh)[1] + keep = order[keep] + if post_max_size is not None: + keep = keep[:post_max_size] + return keep + + +# This function duplicates functionality of mmcv.ops.iou_3d.nms_normal_bev +# from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms. +# Nms api will be unified in mmdetection3d one day. +def nms_normal_bev(boxes, scores, thresh): + """Normal NMS function GPU implementation (for BEV boxes). The overlap of + two boxes for IoU calculation is defined as the exact overlapping area of + the two boxes WITH their yaw angle set to 0. + + Args: + boxes (torch.Tensor): Input boxes with shape (N, 5). + scores (torch.Tensor): Scores of predicted boxes with shape (N). + thresh (float): Overlap threshold of NMS. + + Returns: + torch.Tensor: Remaining indices with scores in descending order. + """ + assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]' + return nms(boxes[:, :-1], scores, thresh)[1] diff --git a/mmdet3d/core/post_processing/merge_augs.py b/mmdet3d/core/post_processing/merge_augs.py index 0e20dcd..981430c 100644 --- a/mmdet3d/core/post_processing/merge_augs.py +++ b/mmdet3d/core/post_processing/merge_augs.py @@ -1,92 +1,92 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet3d.core.post_processing import nms_bev, nms_normal_bev -from ..bbox import bbox3d2result, bbox3d_mapping_back, xywhr2xyxyr - - -def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg): - """Merge augmented detection 3D bboxes and scores. - - Args: - aug_results (list[dict]): The dict of detection results. - The dict contains the following keys - - - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - - scores_3d (torch.Tensor): Detection scores. - - labels_3d (torch.Tensor): Predicted box labels. - img_metas (list[dict]): Meta information of each sample. - test_cfg (dict): Test config. - - Returns: - dict: Bounding boxes results in cpu mode, containing merged results. - - - boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox. - - scores_3d (torch.Tensor): Merged detection scores. - - labels_3d (torch.Tensor): Merged predicted box labels. 
- """ - - assert len(aug_results) == len(img_metas), \ - '"aug_results" should have the same length as "img_metas", got len(' \ - f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}' - - recovered_bboxes = [] - recovered_scores = [] - recovered_labels = [] - - for bboxes, img_info in zip(aug_results, img_metas): - scale_factor = img_info[0]['pcd_scale_factor'] - pcd_horizontal_flip = img_info[0]['pcd_horizontal_flip'] - pcd_vertical_flip = img_info[0]['pcd_vertical_flip'] - recovered_scores.append(bboxes['scores_3d']) - recovered_labels.append(bboxes['labels_3d']) - bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], scale_factor, - pcd_horizontal_flip, pcd_vertical_flip) - recovered_bboxes.append(bboxes) - - aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes) - aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev) - aug_scores = torch.cat(recovered_scores, dim=0) - aug_labels = torch.cat(recovered_labels, dim=0) - - # TODO: use a more elegent way to deal with nms - if test_cfg.use_rotate_nms: - nms_func = nms_bev - else: - nms_func = nms_normal_bev - - merged_bboxes = [] - merged_scores = [] - merged_labels = [] - - # Apply multi-class nms when merge bboxes - if len(aug_labels) == 0: - return bbox3d2result(aug_bboxes, aug_scores, aug_labels) - - for class_id in range(torch.max(aug_labels).item() + 1): - class_inds = (aug_labels == class_id) - bboxes_i = aug_bboxes[class_inds] - bboxes_nms_i = aug_bboxes_for_nms[class_inds, :] - scores_i = aug_scores[class_inds] - labels_i = aug_labels[class_inds] - if len(bboxes_nms_i) == 0: - continue - selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr) - - merged_bboxes.append(bboxes_i[selected, :]) - merged_scores.append(scores_i[selected]) - merged_labels.append(labels_i[selected]) - - merged_bboxes = merged_bboxes[0].cat(merged_bboxes) - merged_scores = torch.cat(merged_scores, dim=0) - merged_labels = torch.cat(merged_labels, dim=0) - - _, order = merged_scores.sort(0, descending=True) - num = min(test_cfg.max_num, len(aug_bboxes)) - order = order[:num] - - merged_bboxes = merged_bboxes[order] - merged_scores = merged_scores[order] - merged_labels = merged_labels[order] - - return bbox3d2result(merged_bboxes, merged_scores, merged_labels) +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.core.post_processing import nms_bev, nms_normal_bev +from ..bbox import bbox3d2result, bbox3d_mapping_back, xywhr2xyxyr + + +def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg): + """Merge augmented detection 3D bboxes and scores. + + Args: + aug_results (list[dict]): The dict of detection results. + The dict contains the following keys + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + img_metas (list[dict]): Meta information of each sample. + test_cfg (dict): Test config. + + Returns: + dict: Bounding boxes results in cpu mode, containing merged results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox. + - scores_3d (torch.Tensor): Merged detection scores. + - labels_3d (torch.Tensor): Merged predicted box labels. 
+ """ + + assert len(aug_results) == len(img_metas), \ + '"aug_results" should have the same length as "img_metas", got len(' \ + f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}' + + recovered_bboxes = [] + recovered_scores = [] + recovered_labels = [] + + for bboxes, img_info in zip(aug_results, img_metas): + scale_factor = img_info[0]['pcd_scale_factor'] + pcd_horizontal_flip = img_info[0]['pcd_horizontal_flip'] + pcd_vertical_flip = img_info[0]['pcd_vertical_flip'] + recovered_scores.append(bboxes['scores_3d']) + recovered_labels.append(bboxes['labels_3d']) + bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], scale_factor, + pcd_horizontal_flip, pcd_vertical_flip) + recovered_bboxes.append(bboxes) + + aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes) + aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev) + aug_scores = torch.cat(recovered_scores, dim=0) + aug_labels = torch.cat(recovered_labels, dim=0) + + # TODO: use a more elegent way to deal with nms + if test_cfg.use_rotate_nms: + nms_func = nms_bev + else: + nms_func = nms_normal_bev + + merged_bboxes = [] + merged_scores = [] + merged_labels = [] + + # Apply multi-class nms when merge bboxes + if len(aug_labels) == 0: + return bbox3d2result(aug_bboxes, aug_scores, aug_labels) + + for class_id in range(torch.max(aug_labels).item() + 1): + class_inds = (aug_labels == class_id) + bboxes_i = aug_bboxes[class_inds] + bboxes_nms_i = aug_bboxes_for_nms[class_inds, :] + scores_i = aug_scores[class_inds] + labels_i = aug_labels[class_inds] + if len(bboxes_nms_i) == 0: + continue + selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr) + + merged_bboxes.append(bboxes_i[selected, :]) + merged_scores.append(scores_i[selected]) + merged_labels.append(labels_i[selected]) + + merged_bboxes = merged_bboxes[0].cat(merged_bboxes) + merged_scores = torch.cat(merged_scores, dim=0) + merged_labels = torch.cat(merged_labels, dim=0) + + _, order = merged_scores.sort(0, descending=True) + num = min(test_cfg.max_num, len(aug_bboxes)) + order = order[:num] + + merged_bboxes = merged_bboxes[order] + merged_scores = merged_scores[order] + merged_labels = merged_labels[order] + + return bbox3d2result(merged_bboxes, merged_scores, merged_labels) diff --git a/mmdet3d/core/utils/__init__.py b/mmdet3d/core/utils/__init__.py index b2a8dec..5862d44 100644 --- a/mmdet3d/core/utils/__init__.py +++ b/mmdet3d/core/utils/__init__.py @@ -1,10 +1,10 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .array_converter import ArrayConverter, array_converter -from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d, - gaussian_radius, get_ellip_gaussian_2D) - -__all__ = [ - 'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian', - 'ArrayConverter', 'array_converter', 'ellip_gaussian2D', - 'get_ellip_gaussian_2D' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .array_converter import ArrayConverter, array_converter +from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d, + gaussian_radius, get_ellip_gaussian_2D) + +__all__ = [ + 'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian', + 'ArrayConverter', 'array_converter', 'ellip_gaussian2D', + 'get_ellip_gaussian_2D' +] diff --git a/mmdet3d/core/utils/array_converter.py b/mmdet3d/core/utils/array_converter.py index a555aa6..bd11c69 100644 --- a/mmdet3d/core/utils/array_converter.py +++ b/mmdet3d/core/utils/array_converter.py @@ -1,324 +1,324 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import functools -from inspect import getfullargspec - -import numpy as np -import torch - - -def array_converter(to_torch=True, - apply_to=tuple(), - template_arg_name_=None, - recover=True): - """Wrapper function for data-type agnostic processing. - - First converts input arrays to PyTorch tensors or NumPy ndarrays - for middle calculation, then convert output to original data-type if - `recover=True`. - - Args: - to_torch (Bool, optional): Whether convert to PyTorch tensors - for middle calculation. Defaults to True. - apply_to (tuple[str], optional): The arguments to which we apply - data-type conversion. Defaults to an empty tuple. - template_arg_name_ (str, optional): Argument serving as the template ( - return arrays should have the same dtype and device - as the template). Defaults to None. If None, we will use the - first argument in `apply_to` as the template argument. - recover (Bool, optional): Whether or not recover the wrapped function - outputs to the `template_arg_name_` type. Defaults to True. - - Raises: - ValueError: When template_arg_name_ is not among all args, or - when apply_to contains an arg which is not among all args, - a ValueError will be raised. When the template argument or - an argument to convert is a list or tuple, and cannot be - converted to a NumPy array, a ValueError will be raised. - TypeError: When the type of the template argument or - an argument to convert does not belong to the above range, - or the contents of such an list-or-tuple-type argument - do not share the same data type, a TypeError is raised. - - Returns: - (function): wrapped function. - - Example: - >>> import torch - >>> import numpy as np - >>> - >>> # Use torch addition for a + b, - >>> # and convert return values to the type of a - >>> @array_converter(apply_to=('a', 'b')) - >>> def simple_add(a, b): - >>> return a + b - >>> - >>> a = np.array([1.1]) - >>> b = np.array([2.2]) - >>> simple_add(a, b) - >>> - >>> # Use numpy addition for a + b, - >>> # and convert return values to the type of b - >>> @array_converter(to_torch=False, apply_to=('a', 'b'), - >>> template_arg_name_='b') - >>> def simple_add(a, b): - >>> return a + b - >>> - >>> simple_add() - >>> - >>> # Use torch funcs for floor(a) if flag=True else ceil(a), - >>> # and return the torch tensor - >>> @array_converter(apply_to=('a',), recover=False) - >>> def floor_or_ceil(a, flag=True): - >>> return torch.floor(a) if flag else torch.ceil(a) - >>> - >>> floor_or_ceil(a, flag=False) - """ - - def array_converter_wrapper(func): - """Outer wrapper for the function.""" - - @functools.wraps(func) - def new_func(*args, **kwargs): - """Inner wrapper for the arguments.""" - if len(apply_to) == 0: - return func(*args, **kwargs) - - func_name = func.__name__ - - arg_spec = getfullargspec(func) - - arg_names = arg_spec.args - arg_num = len(arg_names) - default_arg_values = arg_spec.defaults - if default_arg_values is None: - default_arg_values = [] - no_default_arg_num = len(arg_names) - len(default_arg_values) - - kwonly_arg_names = arg_spec.kwonlyargs - kwonly_default_arg_values = arg_spec.kwonlydefaults - if kwonly_default_arg_values is None: - kwonly_default_arg_values = {} - - all_arg_names = arg_names + kwonly_arg_names - - # in case there are args in the form of *args - if len(args) > arg_num: - named_args = args[:arg_num] - nameless_args = args[arg_num:] - else: - named_args = args - nameless_args = [] - - # template argument data type is used for all array-like arguments - if template_arg_name_ is None: - 
template_arg_name = apply_to[0] - else: - template_arg_name = template_arg_name_ - - if template_arg_name not in all_arg_names: - raise ValueError(f'{template_arg_name} is not among the ' - f'argument list of function {func_name}') - - # inspect apply_to - for arg_to_apply in apply_to: - if arg_to_apply not in all_arg_names: - raise ValueError(f'{arg_to_apply} is not ' - f'an argument of {func_name}') - - new_args = [] - new_kwargs = {} - - converter = ArrayConverter() - target_type = torch.Tensor if to_torch else np.ndarray - - # non-keyword arguments - for i, arg_value in enumerate(named_args): - if arg_names[i] in apply_to: - new_args.append( - converter.convert( - input_array=arg_value, target_type=target_type)) - else: - new_args.append(arg_value) - - if arg_names[i] == template_arg_name: - template_arg_value = arg_value - - kwonly_default_arg_values.update(kwargs) - kwargs = kwonly_default_arg_values - - # keyword arguments and non-keyword arguments using default value - for i in range(len(named_args), len(all_arg_names)): - arg_name = all_arg_names[i] - if arg_name in kwargs: - if arg_name in apply_to: - new_kwargs[arg_name] = converter.convert( - input_array=kwargs[arg_name], - target_type=target_type) - else: - new_kwargs[arg_name] = kwargs[arg_name] - else: - default_value = default_arg_values[i - no_default_arg_num] - if arg_name in apply_to: - new_kwargs[arg_name] = converter.convert( - input_array=default_value, target_type=target_type) - else: - new_kwargs[arg_name] = default_value - if arg_name == template_arg_name: - template_arg_value = kwargs[arg_name] - - # add nameless args provided by *args (if exists) - new_args += nameless_args - - return_values = func(*new_args, **new_kwargs) - converter.set_template(template_arg_value) - - def recursive_recover(input_data): - if isinstance(input_data, (tuple, list)): - new_data = [] - for item in input_data: - new_data.append(recursive_recover(item)) - return tuple(new_data) if isinstance(input_data, - tuple) else new_data - elif isinstance(input_data, dict): - new_data = {} - for k, v in input_data.items(): - new_data[k] = recursive_recover(v) - return new_data - elif isinstance(input_data, (torch.Tensor, np.ndarray)): - return converter.recover(input_data) - else: - return input_data - - if recover: - return recursive_recover(return_values) - else: - return return_values - - return new_func - - return array_converter_wrapper - - -class ArrayConverter: - - SUPPORTED_NON_ARRAY_TYPES = (int, float, np.int8, np.int16, np.int32, - np.int64, np.uint8, np.uint16, np.uint32, - np.uint64, np.float16, np.float32, np.float64) - - def __init__(self, template_array=None): - if template_array is not None: - self.set_template(template_array) - - def set_template(self, array): - """Set template array. - - Args: - array (tuple | list | int | float | np.ndarray | torch.Tensor): - Template array. - - Raises: - ValueError: If input is list or tuple and cannot be converted to - to a NumPy array, a ValueError is raised. - TypeError: If input type does not belong to the above range, - or the contents of a list or tuple do not share the - same data type, a TypeError is raised. 
- """ - self.array_type = type(array) - self.is_num = False - self.device = 'cpu' - - if isinstance(array, np.ndarray): - self.dtype = array.dtype - elif isinstance(array, torch.Tensor): - self.dtype = array.dtype - self.device = array.device - elif isinstance(array, (list, tuple)): - try: - array = np.array(array) - if array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: - raise TypeError - self.dtype = array.dtype - except (ValueError, TypeError): - print(f'The following list cannot be converted to' - f' a numpy array of supported dtype:\n{array}') - raise - elif isinstance(array, self.SUPPORTED_NON_ARRAY_TYPES): - self.array_type = np.ndarray - self.is_num = True - self.dtype = np.dtype(type(array)) - else: - raise TypeError(f'Template type {self.array_type}' - f' is not supported.') - - def convert(self, input_array, target_type=None, target_array=None): - """Convert input array to target data type. - - Args: - input_array (tuple | list | np.ndarray | - torch.Tensor | int | float ): - Input array. Defaults to None. - target_type ( | , - optional): - Type to which input array is converted. Defaults to None. - target_array (np.ndarray | torch.Tensor, optional): - Template array to which input array is converted. - Defaults to None. - - Raises: - ValueError: If input is list or tuple and cannot be converted to - to a NumPy array, a ValueError is raised. - TypeError: If input type does not belong to the above range, - or the contents of a list or tuple do not share the - same data type, a TypeError is raised. - """ - if isinstance(input_array, (list, tuple)): - try: - input_array = np.array(input_array) - if input_array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: - raise TypeError - except (ValueError, TypeError): - print(f'The input cannot be converted to' - f' a single-type numpy array:\n{input_array}') - raise - elif isinstance(input_array, self.SUPPORTED_NON_ARRAY_TYPES): - input_array = np.array(input_array) - array_type = type(input_array) - assert target_type is not None or target_array is not None, \ - 'must specify a target' - if target_type is not None: - assert target_type in (np.ndarray, torch.Tensor), \ - 'invalid target type' - if target_type == array_type: - return input_array - elif target_type == np.ndarray: - # default dtype is float32 - converted_array = input_array.cpu().numpy().astype(np.float32) - else: - # default dtype is float32, device is 'cpu' - converted_array = torch.tensor( - input_array, dtype=torch.float32) - else: - assert isinstance(target_array, (np.ndarray, torch.Tensor)), \ - 'invalid target array type' - if isinstance(target_array, array_type): - return input_array - elif isinstance(target_array, np.ndarray): - converted_array = input_array.cpu().numpy().astype( - target_array.dtype) - else: - converted_array = target_array.new_tensor(input_array) - return converted_array - - def recover(self, input_array): - assert isinstance(input_array, (np.ndarray, torch.Tensor)), \ - 'invalid input array type' - if isinstance(input_array, self.array_type): - return input_array - elif isinstance(input_array, torch.Tensor): - converted_array = input_array.cpu().numpy().astype(self.dtype) - else: - converted_array = torch.tensor( - input_array, dtype=self.dtype, device=self.device) - if self.is_num: - converted_array = converted_array.item() - return converted_array +# Copyright (c) OpenMMLab. All rights reserved. 
+import functools +from inspect import getfullargspec + +import numpy as np +import torch + + +def array_converter(to_torch=True, + apply_to=tuple(), + template_arg_name_=None, + recover=True): + """Wrapper function for data-type agnostic processing. + + First converts input arrays to PyTorch tensors or NumPy ndarrays + for middle calculation, then convert output to original data-type if + `recover=True`. + + Args: + to_torch (Bool, optional): Whether convert to PyTorch tensors + for middle calculation. Defaults to True. + apply_to (tuple[str], optional): The arguments to which we apply + data-type conversion. Defaults to an empty tuple. + template_arg_name_ (str, optional): Argument serving as the template ( + return arrays should have the same dtype and device + as the template). Defaults to None. If None, we will use the + first argument in `apply_to` as the template argument. + recover (Bool, optional): Whether or not recover the wrapped function + outputs to the `template_arg_name_` type. Defaults to True. + + Raises: + ValueError: When template_arg_name_ is not among all args, or + when apply_to contains an arg which is not among all args, + a ValueError will be raised. When the template argument or + an argument to convert is a list or tuple, and cannot be + converted to a NumPy array, a ValueError will be raised. + TypeError: When the type of the template argument or + an argument to convert does not belong to the above range, + or the contents of such an list-or-tuple-type argument + do not share the same data type, a TypeError is raised. + + Returns: + (function): wrapped function. + + Example: + >>> import torch + >>> import numpy as np + >>> + >>> # Use torch addition for a + b, + >>> # and convert return values to the type of a + >>> @array_converter(apply_to=('a', 'b')) + >>> def simple_add(a, b): + >>> return a + b + >>> + >>> a = np.array([1.1]) + >>> b = np.array([2.2]) + >>> simple_add(a, b) + >>> + >>> # Use numpy addition for a + b, + >>> # and convert return values to the type of b + >>> @array_converter(to_torch=False, apply_to=('a', 'b'), + >>> template_arg_name_='b') + >>> def simple_add(a, b): + >>> return a + b + >>> + >>> simple_add() + >>> + >>> # Use torch funcs for floor(a) if flag=True else ceil(a), + >>> # and return the torch tensor + >>> @array_converter(apply_to=('a',), recover=False) + >>> def floor_or_ceil(a, flag=True): + >>> return torch.floor(a) if flag else torch.ceil(a) + >>> + >>> floor_or_ceil(a, flag=False) + """ + + def array_converter_wrapper(func): + """Outer wrapper for the function.""" + + @functools.wraps(func) + def new_func(*args, **kwargs): + """Inner wrapper for the arguments.""" + if len(apply_to) == 0: + return func(*args, **kwargs) + + func_name = func.__name__ + + arg_spec = getfullargspec(func) + + arg_names = arg_spec.args + arg_num = len(arg_names) + default_arg_values = arg_spec.defaults + if default_arg_values is None: + default_arg_values = [] + no_default_arg_num = len(arg_names) - len(default_arg_values) + + kwonly_arg_names = arg_spec.kwonlyargs + kwonly_default_arg_values = arg_spec.kwonlydefaults + if kwonly_default_arg_values is None: + kwonly_default_arg_values = {} + + all_arg_names = arg_names + kwonly_arg_names + + # in case there are args in the form of *args + if len(args) > arg_num: + named_args = args[:arg_num] + nameless_args = args[arg_num:] + else: + named_args = args + nameless_args = [] + + # template argument data type is used for all array-like arguments + if template_arg_name_ is None: + 
template_arg_name = apply_to[0] + else: + template_arg_name = template_arg_name_ + + if template_arg_name not in all_arg_names: + raise ValueError(f'{template_arg_name} is not among the ' + f'argument list of function {func_name}') + + # inspect apply_to + for arg_to_apply in apply_to: + if arg_to_apply not in all_arg_names: + raise ValueError(f'{arg_to_apply} is not ' + f'an argument of {func_name}') + + new_args = [] + new_kwargs = {} + + converter = ArrayConverter() + target_type = torch.Tensor if to_torch else np.ndarray + + # non-keyword arguments + for i, arg_value in enumerate(named_args): + if arg_names[i] in apply_to: + new_args.append( + converter.convert( + input_array=arg_value, target_type=target_type)) + else: + new_args.append(arg_value) + + if arg_names[i] == template_arg_name: + template_arg_value = arg_value + + kwonly_default_arg_values.update(kwargs) + kwargs = kwonly_default_arg_values + + # keyword arguments and non-keyword arguments using default value + for i in range(len(named_args), len(all_arg_names)): + arg_name = all_arg_names[i] + if arg_name in kwargs: + if arg_name in apply_to: + new_kwargs[arg_name] = converter.convert( + input_array=kwargs[arg_name], + target_type=target_type) + else: + new_kwargs[arg_name] = kwargs[arg_name] + else: + default_value = default_arg_values[i - no_default_arg_num] + if arg_name in apply_to: + new_kwargs[arg_name] = converter.convert( + input_array=default_value, target_type=target_type) + else: + new_kwargs[arg_name] = default_value + if arg_name == template_arg_name: + template_arg_value = kwargs[arg_name] + + # add nameless args provided by *args (if exists) + new_args += nameless_args + + return_values = func(*new_args, **new_kwargs) + converter.set_template(template_arg_value) + + def recursive_recover(input_data): + if isinstance(input_data, (tuple, list)): + new_data = [] + for item in input_data: + new_data.append(recursive_recover(item)) + return tuple(new_data) if isinstance(input_data, + tuple) else new_data + elif isinstance(input_data, dict): + new_data = {} + for k, v in input_data.items(): + new_data[k] = recursive_recover(v) + return new_data + elif isinstance(input_data, (torch.Tensor, np.ndarray)): + return converter.recover(input_data) + else: + return input_data + + if recover: + return recursive_recover(return_values) + else: + return return_values + + return new_func + + return array_converter_wrapper + + +class ArrayConverter: + + SUPPORTED_NON_ARRAY_TYPES = (int, float, np.int8, np.int16, np.int32, + np.int64, np.uint8, np.uint16, np.uint32, + np.uint64, np.float16, np.float32, np.float64) + + def __init__(self, template_array=None): + if template_array is not None: + self.set_template(template_array) + + def set_template(self, array): + """Set template array. + + Args: + array (tuple | list | int | float | np.ndarray | torch.Tensor): + Template array. + + Raises: + ValueError: If input is list or tuple and cannot be converted to + to a NumPy array, a ValueError is raised. + TypeError: If input type does not belong to the above range, + or the contents of a list or tuple do not share the + same data type, a TypeError is raised. 
+ """ + self.array_type = type(array) + self.is_num = False + self.device = 'cpu' + + if isinstance(array, np.ndarray): + self.dtype = array.dtype + elif isinstance(array, torch.Tensor): + self.dtype = array.dtype + self.device = array.device + elif isinstance(array, (list, tuple)): + try: + array = np.array(array) + if array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: + raise TypeError + self.dtype = array.dtype + except (ValueError, TypeError): + print(f'The following list cannot be converted to' + f' a numpy array of supported dtype:\n{array}') + raise + elif isinstance(array, self.SUPPORTED_NON_ARRAY_TYPES): + self.array_type = np.ndarray + self.is_num = True + self.dtype = np.dtype(type(array)) + else: + raise TypeError(f'Template type {self.array_type}' + f' is not supported.') + + def convert(self, input_array, target_type=None, target_array=None): + """Convert input array to target data type. + + Args: + input_array (tuple | list | np.ndarray | + torch.Tensor | int | float ): + Input array. Defaults to None. + target_type ( | , + optional): + Type to which input array is converted. Defaults to None. + target_array (np.ndarray | torch.Tensor, optional): + Template array to which input array is converted. + Defaults to None. + + Raises: + ValueError: If input is list or tuple and cannot be converted to + to a NumPy array, a ValueError is raised. + TypeError: If input type does not belong to the above range, + or the contents of a list or tuple do not share the + same data type, a TypeError is raised. + """ + if isinstance(input_array, (list, tuple)): + try: + input_array = np.array(input_array) + if input_array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: + raise TypeError + except (ValueError, TypeError): + print(f'The input cannot be converted to' + f' a single-type numpy array:\n{input_array}') + raise + elif isinstance(input_array, self.SUPPORTED_NON_ARRAY_TYPES): + input_array = np.array(input_array) + array_type = type(input_array) + assert target_type is not None or target_array is not None, \ + 'must specify a target' + if target_type is not None: + assert target_type in (np.ndarray, torch.Tensor), \ + 'invalid target type' + if target_type == array_type: + return input_array + elif target_type == np.ndarray: + # default dtype is float32 + converted_array = input_array.cpu().numpy().astype(np.float32) + else: + # default dtype is float32, device is 'cpu' + converted_array = torch.tensor( + input_array, dtype=torch.float32) + else: + assert isinstance(target_array, (np.ndarray, torch.Tensor)), \ + 'invalid target array type' + if isinstance(target_array, array_type): + return input_array + elif isinstance(target_array, np.ndarray): + converted_array = input_array.cpu().numpy().astype( + target_array.dtype) + else: + converted_array = target_array.new_tensor(input_array) + return converted_array + + def recover(self, input_array): + assert isinstance(input_array, (np.ndarray, torch.Tensor)), \ + 'invalid input array type' + if isinstance(input_array, self.array_type): + return input_array + elif isinstance(input_array, torch.Tensor): + converted_array = input_array.cpu().numpy().astype(self.dtype) + else: + converted_array = torch.tensor( + input_array, dtype=self.dtype, device=self.device) + if self.is_num: + converted_array = converted_array.item() + return converted_array diff --git a/mmdet3d/core/utils/gaussian.py b/mmdet3d/core/utils/gaussian.py index 66ccbd9..854faaa 100644 --- a/mmdet3d/core/utils/gaussian.py +++ b/mmdet3d/core/utils/gaussian.py @@ -1,158 +1,158 @@ -# 
Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - - -def gaussian_2d(shape, sigma=1): - """Generate gaussian map. - - Args: - shape (list[int]): Shape of the map. - sigma (float, optional): Sigma to generate gaussian map. - Defaults to 1. - - Returns: - np.ndarray: Generated gaussian map. - """ - m, n = [(ss - 1.) / 2. for ss in shape] - y, x = np.ogrid[-m:m + 1, -n:n + 1] - - h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) - h[h < np.finfo(h.dtype).eps * h.max()] = 0 - return h - - -def draw_heatmap_gaussian(heatmap, center, radius, k=1): - """Get gaussian masked heatmap. - - Args: - heatmap (torch.Tensor): Heatmap to be masked. - center (torch.Tensor): Center coord of the heatmap. - radius (int): Radius of gaussian. - K (int, optional): Multiple of masked_gaussian. Defaults to 1. - - Returns: - torch.Tensor: Masked heatmap. - """ - diameter = 2 * radius + 1 - gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) - - x, y = int(center[0]), int(center[1]) - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = torch.from_numpy( - gaussian[radius - top:radius + bottom, - radius - left:radius + right]).to(heatmap.device, - torch.float32) - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) - return heatmap - - -def gaussian_radius(det_size, min_overlap=0.5): - """Get radius of gaussian. - - Args: - det_size (tuple[torch.Tensor]): Size of the detection result. - min_overlap (float, optional): Gaussian_overlap. Defaults to 0.5. - - Returns: - torch.Tensor: Computed radius. - """ - height, width = det_size - - a1 = 1 - b1 = (height + width) - c1 = width * height * (1 - min_overlap) / (1 + min_overlap) - sq1 = torch.sqrt(b1**2 - 4 * a1 * c1) - r1 = (b1 + sq1) / 2 - - a2 = 4 - b2 = 2 * (height + width) - c2 = (1 - min_overlap) * width * height - sq2 = torch.sqrt(b2**2 - 4 * a2 * c2) - r2 = (b2 + sq2) / 2 - - a3 = 4 * min_overlap - b3 = -2 * min_overlap * (height + width) - c3 = (min_overlap - 1) * width * height - sq3 = torch.sqrt(b3**2 - 4 * a3 * c3) - r3 = (b3 + sq3) / 2 - return min(r1, r2, r3) - - -def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1): - """Generate 2D ellipse gaussian heatmap. - - Args: - heatmap (Tensor): Input heatmap, the gaussian kernel will cover on - it and maintain the max value. - center (list[int]): Coord of gaussian kernel's center. - radius_x (int): X-axis radius of gaussian kernel. - radius_y (int): Y-axis radius of gaussian kernel. - k (int, optional): Coefficient of gaussian kernel. Default: 1. - - Returns: - out_heatmap (Tensor): Updated heatmap covered by gaussian kernel. 
- """ - diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1 - gaussian_kernel = ellip_gaussian2D((radius_x, radius_y), - sigma_x=diameter_x / 6, - sigma_y=diameter_y / 6, - dtype=heatmap.dtype, - device=heatmap.device) - - x, y = int(center[0]), int(center[1]) - height, width = heatmap.shape[0:2] - - left, right = min(x, radius_x), min(width - x, radius_x + 1) - top, bottom = min(y, radius_y), min(height - y, radius_y + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom, - radius_x - left:radius_x + right] - out_heatmap = heatmap - torch.max( - masked_heatmap, - masked_gaussian * k, - out=out_heatmap[y - top:y + bottom, x - left:x + right]) - - return out_heatmap - - -def ellip_gaussian2D(radius, - sigma_x, - sigma_y, - dtype=torch.float32, - device='cpu'): - """Generate 2D ellipse gaussian kernel. - - Args: - radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian - kernel. - sigma_x (int): X-axis sigma of gaussian function. - sigma_y (int): Y-axis sigma of gaussian function. - dtype (torch.dtype, optional): Dtype of gaussian tensor. - Default: torch.float32. - device (str, optional): Device of gaussian tensor. - Default: 'cpu'. - - Returns: - h (Tensor): Gaussian kernel with a - ``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape. - """ - x = torch.arange( - -radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1) - y = torch.arange( - -radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1) - - h = (-(x * x) / (2 * sigma_x * sigma_x) - (y * y) / - (2 * sigma_y * sigma_y)).exp() - h[h < torch.finfo(h.dtype).eps * h.max()] = 0 - - return h +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + + +def gaussian_2d(shape, sigma=1): + """Generate gaussian map. + + Args: + shape (list[int]): Shape of the map. + sigma (float, optional): Sigma to generate gaussian map. + Defaults to 1. + + Returns: + np.ndarray: Generated gaussian map. + """ + m, n = [(ss - 1.) / 2. for ss in shape] + y, x = np.ogrid[-m:m + 1, -n:n + 1] + + h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) + h[h < np.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def draw_heatmap_gaussian(heatmap, center, radius, k=1): + """Get gaussian masked heatmap. + + Args: + heatmap (torch.Tensor): Heatmap to be masked. + center (torch.Tensor): Center coord of the heatmap. + radius (int): Radius of gaussian. + K (int, optional): Multiple of masked_gaussian. Defaults to 1. + + Returns: + torch.Tensor: Masked heatmap. + """ + diameter = 2 * radius + 1 + gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) + + x, y = int(center[0]), int(center[1]) + + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = torch.from_numpy( + gaussian[radius - top:radius + bottom, + radius - left:radius + right]).to(heatmap.device, + torch.float32) + if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: + torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + return heatmap + + +def gaussian_radius(det_size, min_overlap=0.5): + """Get radius of gaussian. + + Args: + det_size (tuple[torch.Tensor]): Size of the detection result. + min_overlap (float, optional): Gaussian_overlap. Defaults to 0.5. + + Returns: + torch.Tensor: Computed radius. 
+ """ + height, width = det_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = torch.sqrt(b1**2 - 4 * a1 * c1) + r1 = (b1 + sq1) / 2 + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = torch.sqrt(b2**2 - 4 * a2 * c2) + r2 = (b2 + sq2) / 2 + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = torch.sqrt(b3**2 - 4 * a3 * c3) + r3 = (b3 + sq3) / 2 + return min(r1, r2, r3) + + +def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1): + """Generate 2D ellipse gaussian heatmap. + + Args: + heatmap (Tensor): Input heatmap, the gaussian kernel will cover on + it and maintain the max value. + center (list[int]): Coord of gaussian kernel's center. + radius_x (int): X-axis radius of gaussian kernel. + radius_y (int): Y-axis radius of gaussian kernel. + k (int, optional): Coefficient of gaussian kernel. Default: 1. + + Returns: + out_heatmap (Tensor): Updated heatmap covered by gaussian kernel. + """ + diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1 + gaussian_kernel = ellip_gaussian2D((radius_x, radius_y), + sigma_x=diameter_x / 6, + sigma_y=diameter_y / 6, + dtype=heatmap.dtype, + device=heatmap.device) + + x, y = int(center[0]), int(center[1]) + height, width = heatmap.shape[0:2] + + left, right = min(x, radius_x), min(width - x, radius_x + 1) + top, bottom = min(y, radius_y), min(height - y, radius_y + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom, + radius_x - left:radius_x + right] + out_heatmap = heatmap + torch.max( + masked_heatmap, + masked_gaussian * k, + out=out_heatmap[y - top:y + bottom, x - left:x + right]) + + return out_heatmap + + +def ellip_gaussian2D(radius, + sigma_x, + sigma_y, + dtype=torch.float32, + device='cpu'): + """Generate 2D ellipse gaussian kernel. + + Args: + radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian + kernel. + sigma_x (int): X-axis sigma of gaussian function. + sigma_y (int): Y-axis sigma of gaussian function. + dtype (torch.dtype, optional): Dtype of gaussian tensor. + Default: torch.float32. + device (str, optional): Device of gaussian tensor. + Default: 'cpu'. + + Returns: + h (Tensor): Gaussian kernel with a + ``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape. + """ + x = torch.arange( + -radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1) + y = torch.arange( + -radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1) + + h = (-(x * x) / (2 * sigma_x * sigma_x) - (y * y) / + (2 * sigma_y * sigma_y)).exp() + h[h < torch.finfo(h.dtype).eps * h.max()] = 0 + + return h diff --git a/mmdet3d/core/visualizer/__init__.py b/mmdet3d/core/visualizer/__init__.py index bbf1e60..528a4f5 100644 --- a/mmdet3d/core/visualizer/__init__.py +++ b/mmdet3d/core/visualizer/__init__.py @@ -1,5 +1,5 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .show_result import (show_multi_modality_result, show_result, - show_seg_result) - -__all__ = ['show_result', 'show_seg_result', 'show_multi_modality_result'] +# Copyright (c) OpenMMLab. All rights reserved. 
+from .show_result import (show_multi_modality_result, show_result, + show_seg_result) + +__all__ = ['show_result', 'show_seg_result', 'show_multi_modality_result'] diff --git a/mmdet3d/core/visualizer/image_vis.py b/mmdet3d/core/visualizer/image_vis.py index 7ac765c..21cfb1e 100644 --- a/mmdet3d/core/visualizer/image_vis.py +++ b/mmdet3d/core/visualizer/image_vis.py @@ -1,206 +1,206 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import cv2 -import numpy as np -import torch -from matplotlib import pyplot as plt - - -def project_pts_on_img(points, - raw_img, - lidar2img_rt, - max_distance=70, - thickness=-1): - """Project the 3D points cloud on 2D image. - - Args: - points (numpy.array): 3D points cloud (x, y, z) to visualize. - raw_img (numpy.array): The numpy array of image. - lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix - according to the camera intrinsic parameters. - max_distance (float, optional): the max distance of the points cloud. - Default: 70. - thickness (int, optional): The thickness of 2D points. Default: -1. - """ - img = raw_img.copy() - num_points = points.shape[0] - pts_4d = np.concatenate([points[:, :3], np.ones((num_points, 1))], axis=-1) - pts_2d = pts_4d @ lidar2img_rt.T - - # cam_points is Tensor of Nx4 whose last column is 1 - # transform camera coordinate to image coordinate - pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=99999) - pts_2d[:, 0] /= pts_2d[:, 2] - pts_2d[:, 1] /= pts_2d[:, 2] - - fov_inds = ((pts_2d[:, 0] < img.shape[1]) - & (pts_2d[:, 0] >= 0) - & (pts_2d[:, 1] < img.shape[0]) - & (pts_2d[:, 1] >= 0)) - - imgfov_pts_2d = pts_2d[fov_inds, :3] # u, v, d - - cmap = plt.cm.get_cmap('hsv', 256) - cmap = np.array([cmap(i) for i in range(256)])[:, :3] * 255 - for i in range(imgfov_pts_2d.shape[0]): - depth = imgfov_pts_2d[i, 2] - color = cmap[np.clip(int(max_distance * 10 / depth), 0, 255), :] - cv2.circle( - img, - center=(int(np.round(imgfov_pts_2d[i, 0])), - int(np.round(imgfov_pts_2d[i, 1]))), - radius=1, - color=tuple(color), - thickness=thickness, - ) - cv2.imshow('project_pts_img', img.astype(np.uint8)) - cv2.waitKey(100) - - -def plot_rect3d_on_img(img, - num_rects, - rect_corners, - color=(0, 255, 0), - thickness=1): - """Plot the boundary lines of 3D rectangular on 2D images. - - Args: - img (numpy.array): The numpy array of image. - num_rects (int): Number of 3D rectangulars. - rect_corners (numpy.array): Coordinates of the corners of 3D - rectangulars. Should be in the shape of [num_rect, 8, 2]. - color (tuple[int], optional): The color to draw bboxes. - Default: (0, 255, 0). - thickness (int, optional): The thickness of bboxes. Default: 1. - """ - line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7), - (4, 5), (4, 7), (2, 6), (5, 6), (6, 7)) - for i in range(num_rects): - corners = rect_corners[i].astype(np.int) - for start, end in line_indices: - cv2.line(img, (corners[start, 0], corners[start, 1]), - (corners[end, 0], corners[end, 1]), color, thickness, - cv2.LINE_AA) - - return img.astype(np.uint8) - - -def draw_lidar_bbox3d_on_img(bboxes3d, - raw_img, - lidar2img_rt, - img_metas, - color=(0, 255, 0), - thickness=1): - """Project the 3D bbox on 2D plane and draw on input image. - - Args: - bboxes3d (:obj:`LiDARInstance3DBoxes`): - 3d bbox in lidar coordinate system to visualize. - raw_img (numpy.array): The numpy array of image. - lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix - according to the camera intrinsic parameters. - img_metas (dict): Useless here. 
- color (tuple[int], optional): The color to draw bboxes. - Default: (0, 255, 0). - thickness (int, optional): The thickness of bboxes. Default: 1. - """ - img = raw_img.copy() - corners_3d = bboxes3d.corners - num_bbox = corners_3d.shape[0] - pts_4d = np.concatenate( - [corners_3d.reshape(-1, 3), - np.ones((num_bbox * 8, 1))], axis=-1) - lidar2img_rt = copy.deepcopy(lidar2img_rt).reshape(4, 4) - if isinstance(lidar2img_rt, torch.Tensor): - lidar2img_rt = lidar2img_rt.cpu().numpy() - pts_2d = pts_4d @ lidar2img_rt.T - - pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5) - pts_2d[:, 0] /= pts_2d[:, 2] - pts_2d[:, 1] /= pts_2d[:, 2] - imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2) - - return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) - - -# TODO: remove third parameter in all functions here in favour of img_metas -def draw_depth_bbox3d_on_img(bboxes3d, - raw_img, - calibs, - img_metas, - color=(0, 255, 0), - thickness=1): - """Project the 3D bbox on 2D plane and draw on input image. - - Args: - bboxes3d (:obj:`DepthInstance3DBoxes`, shape=[M, 7]): - 3d bbox in depth coordinate system to visualize. - raw_img (numpy.array): The numpy array of image. - calibs (dict): Camera calibration information, Rt and K. - img_metas (dict): Used in coordinates transformation. - color (tuple[int], optional): The color to draw bboxes. - Default: (0, 255, 0). - thickness (int, optional): The thickness of bboxes. Default: 1. - """ - from mmdet3d.core.bbox import points_cam2img - from mmdet3d.models import apply_3d_transformation - - img = raw_img.copy() - img_metas = copy.deepcopy(img_metas) - corners_3d = bboxes3d.corners - num_bbox = corners_3d.shape[0] - points_3d = corners_3d.reshape(-1, 3) - - # first reverse the data transformations - xyz_depth = apply_3d_transformation( - points_3d, 'DEPTH', img_metas, reverse=True) - - # project to 2d to get image coords (uv) - uv_origin = points_cam2img(xyz_depth, - xyz_depth.new_tensor(img_metas['depth2img'])) - uv_origin = (uv_origin - 1).round() - imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy() - - return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) - - -def draw_camera_bbox3d_on_img(bboxes3d, - raw_img, - cam2img, - img_metas, - color=(0, 255, 0), - thickness=1): - """Project the 3D bbox on 2D plane and draw on input image. - - Args: - bboxes3d (:obj:`CameraInstance3DBoxes`, shape=[M, 7]): - 3d bbox in camera coordinate system to visualize. - raw_img (numpy.array): The numpy array of image. - cam2img (dict): Camera intrinsic matrix, - denoted as `K` in depth bbox coordinate system. - img_metas (dict): Useless here. - color (tuple[int], optional): The color to draw bboxes. - Default: (0, 255, 0). - thickness (int, optional): The thickness of bboxes. Default: 1. 
- """ - from mmdet3d.core.bbox import points_cam2img - - img = raw_img.copy() - cam2img = copy.deepcopy(cam2img) - corners_3d = bboxes3d.corners - num_bbox = corners_3d.shape[0] - points_3d = corners_3d.reshape(-1, 3) - if not isinstance(cam2img, torch.Tensor): - cam2img = torch.from_numpy(np.array(cam2img)) - - assert (cam2img.shape == torch.Size([3, 3]) - or cam2img.shape == torch.Size([4, 4])) - cam2img = cam2img.float().cpu() - - # project to 2d to get image coords (uv) - uv_origin = points_cam2img(points_3d, cam2img) - uv_origin = (uv_origin - 1).round() - imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy() - - return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import cv2 +import numpy as np +import torch +from matplotlib import pyplot as plt + + +def project_pts_on_img(points, + raw_img, + lidar2img_rt, + max_distance=70, + thickness=-1): + """Project the 3D points cloud on 2D image. + + Args: + points (numpy.array): 3D points cloud (x, y, z) to visualize. + raw_img (numpy.array): The numpy array of image. + lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix + according to the camera intrinsic parameters. + max_distance (float, optional): the max distance of the points cloud. + Default: 70. + thickness (int, optional): The thickness of 2D points. Default: -1. + """ + img = raw_img.copy() + num_points = points.shape[0] + pts_4d = np.concatenate([points[:, :3], np.ones((num_points, 1))], axis=-1) + pts_2d = pts_4d @ lidar2img_rt.T + + # cam_points is Tensor of Nx4 whose last column is 1 + # transform camera coordinate to image coordinate + pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=99999) + pts_2d[:, 0] /= pts_2d[:, 2] + pts_2d[:, 1] /= pts_2d[:, 2] + + fov_inds = ((pts_2d[:, 0] < img.shape[1]) + & (pts_2d[:, 0] >= 0) + & (pts_2d[:, 1] < img.shape[0]) + & (pts_2d[:, 1] >= 0)) + + imgfov_pts_2d = pts_2d[fov_inds, :3] # u, v, d + + cmap = plt.cm.get_cmap('hsv', 256) + cmap = np.array([cmap(i) for i in range(256)])[:, :3] * 255 + for i in range(imgfov_pts_2d.shape[0]): + depth = imgfov_pts_2d[i, 2] + color = cmap[np.clip(int(max_distance * 10 / depth), 0, 255), :] + cv2.circle( + img, + center=(int(np.round(imgfov_pts_2d[i, 0])), + int(np.round(imgfov_pts_2d[i, 1]))), + radius=1, + color=tuple(color), + thickness=thickness, + ) + cv2.imshow('project_pts_img', img.astype(np.uint8)) + cv2.waitKey(100) + + +def plot_rect3d_on_img(img, + num_rects, + rect_corners, + color=(0, 255, 0), + thickness=1): + """Plot the boundary lines of 3D rectangular on 2D images. + + Args: + img (numpy.array): The numpy array of image. + num_rects (int): Number of 3D rectangulars. + rect_corners (numpy.array): Coordinates of the corners of 3D + rectangulars. Should be in the shape of [num_rect, 8, 2]. + color (tuple[int], optional): The color to draw bboxes. + Default: (0, 255, 0). + thickness (int, optional): The thickness of bboxes. Default: 1. 
+ """ + line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7), + (4, 5), (4, 7), (2, 6), (5, 6), (6, 7)) + for i in range(num_rects): + corners = rect_corners[i].astype(np.int) + for start, end in line_indices: + cv2.line(img, (corners[start, 0], corners[start, 1]), + (corners[end, 0], corners[end, 1]), color, thickness, + cv2.LINE_AA) + + return img.astype(np.uint8) + + +def draw_lidar_bbox3d_on_img(bboxes3d, + raw_img, + lidar2img_rt, + img_metas, + color=(0, 255, 0), + thickness=1): + """Project the 3D bbox on 2D plane and draw on input image. + + Args: + bboxes3d (:obj:`LiDARInstance3DBoxes`): + 3d bbox in lidar coordinate system to visualize. + raw_img (numpy.array): The numpy array of image. + lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix + according to the camera intrinsic parameters. + img_metas (dict): Useless here. + color (tuple[int], optional): The color to draw bboxes. + Default: (0, 255, 0). + thickness (int, optional): The thickness of bboxes. Default: 1. + """ + img = raw_img.copy() + corners_3d = bboxes3d.corners + num_bbox = corners_3d.shape[0] + pts_4d = np.concatenate( + [corners_3d.reshape(-1, 3), + np.ones((num_bbox * 8, 1))], axis=-1) + lidar2img_rt = copy.deepcopy(lidar2img_rt).reshape(4, 4) + if isinstance(lidar2img_rt, torch.Tensor): + lidar2img_rt = lidar2img_rt.cpu().numpy() + pts_2d = pts_4d @ lidar2img_rt.T + + pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5) + pts_2d[:, 0] /= pts_2d[:, 2] + pts_2d[:, 1] /= pts_2d[:, 2] + imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2) + + return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) + + +# TODO: remove third parameter in all functions here in favour of img_metas +def draw_depth_bbox3d_on_img(bboxes3d, + raw_img, + calibs, + img_metas, + color=(0, 255, 0), + thickness=1): + """Project the 3D bbox on 2D plane and draw on input image. + + Args: + bboxes3d (:obj:`DepthInstance3DBoxes`, shape=[M, 7]): + 3d bbox in depth coordinate system to visualize. + raw_img (numpy.array): The numpy array of image. + calibs (dict): Camera calibration information, Rt and K. + img_metas (dict): Used in coordinates transformation. + color (tuple[int], optional): The color to draw bboxes. + Default: (0, 255, 0). + thickness (int, optional): The thickness of bboxes. Default: 1. + """ + from mmdet3d.core.bbox import points_cam2img + from mmdet3d.models import apply_3d_transformation + + img = raw_img.copy() + img_metas = copy.deepcopy(img_metas) + corners_3d = bboxes3d.corners + num_bbox = corners_3d.shape[0] + points_3d = corners_3d.reshape(-1, 3) + + # first reverse the data transformations + xyz_depth = apply_3d_transformation( + points_3d, 'DEPTH', img_metas, reverse=True) + + # project to 2d to get image coords (uv) + uv_origin = points_cam2img(xyz_depth, + xyz_depth.new_tensor(img_metas['depth2img'])) + uv_origin = (uv_origin - 1).round() + imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy() + + return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) + + +def draw_camera_bbox3d_on_img(bboxes3d, + raw_img, + cam2img, + img_metas, + color=(0, 255, 0), + thickness=1): + """Project the 3D bbox on 2D plane and draw on input image. + + Args: + bboxes3d (:obj:`CameraInstance3DBoxes`, shape=[M, 7]): + 3d bbox in camera coordinate system to visualize. + raw_img (numpy.array): The numpy array of image. + cam2img (dict): Camera intrinsic matrix, + denoted as `K` in depth bbox coordinate system. + img_metas (dict): Useless here. 
+ color (tuple[int], optional): The color to draw bboxes. + Default: (0, 255, 0). + thickness (int, optional): The thickness of bboxes. Default: 1. + """ + from mmdet3d.core.bbox import points_cam2img + + img = raw_img.copy() + cam2img = copy.deepcopy(cam2img) + corners_3d = bboxes3d.corners + num_bbox = corners_3d.shape[0] + points_3d = corners_3d.reshape(-1, 3) + if not isinstance(cam2img, torch.Tensor): + cam2img = torch.from_numpy(np.array(cam2img)) + + assert (cam2img.shape == torch.Size([3, 3]) + or cam2img.shape == torch.Size([4, 4])) + cam2img = cam2img.float().cpu() + + # project to 2d to get image coords (uv) + uv_origin = points_cam2img(points_3d, cam2img) + uv_origin = (uv_origin - 1).round() + imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy() + + return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) diff --git a/mmdet3d/core/visualizer/open3d_vis.py b/mmdet3d/core/visualizer/open3d_vis.py index c63b6ec..ef52323 100644 --- a/mmdet3d/core/visualizer/open3d_vis.py +++ b/mmdet3d/core/visualizer/open3d_vis.py @@ -1,460 +1,460 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import numpy as np -import torch - -try: - import open3d as o3d - from open3d import geometry -except ImportError: - raise ImportError( - 'Please run "pip install open3d" to install open3d first.') - - -def _draw_points(points, - vis, - points_size=2, - point_color=(0.5, 0.5, 0.5), - mode='xyz'): - """Draw points on visualizer. - - Args: - points (numpy.array | torch.tensor, shape=[N, 3+C]): - points to visualize. - vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. - points_size (int, optional): the size of points to show on visualizer. - Default: 2. - point_color (tuple[float], optional): the color of points. - Default: (0.5, 0.5, 0.5). - mode (str, optional): indicate type of the input points, - available mode ['xyz', 'xyzrgb']. Default: 'xyz'. - - Returns: - tuple: points, color of each point. - """ - vis.get_render_option().point_size = points_size # set points size - if isinstance(points, torch.Tensor): - points = points.cpu().numpy() - - points = points.copy() - pcd = geometry.PointCloud() - if mode == 'xyz': - pcd.points = o3d.utility.Vector3dVector(points[:, :3]) - points_colors = np.tile(np.array(point_color), (points.shape[0], 1)) - elif mode == 'xyzrgb': - pcd.points = o3d.utility.Vector3dVector(points[:, :3]) - points_colors = points[:, 3:6] - # normalize to [0, 1] for open3d drawing - if not ((points_colors >= 0.0) & (points_colors <= 1.0)).all(): - points_colors /= 255.0 - else: - raise NotImplementedError - - pcd.colors = o3d.utility.Vector3dVector(points_colors) - vis.add_geometry(pcd) - - return pcd, points_colors - - -def _draw_bboxes(bbox3d, - vis, - points_colors, - pcd=None, - bbox_color=(0, 1, 0), - points_in_box_color=(1, 0, 0), - rot_axis=2, - center_mode='lidar_bottom', - mode='xyz'): - """Draw bbox on visualizer and change the color of points inside bbox3d. - - Args: - bbox3d (numpy.array | torch.tensor, shape=[M, 7]): - 3d bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. - vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. - points_colors (numpy.array): color of each points. - pcd (:obj:`open3d.geometry.PointCloud`, optional): point cloud. - Default: None. - bbox_color (tuple[float], optional): the color of bbox. - Default: (0, 1, 0). - points_in_box_color (tuple[float], optional): - the color of points inside bbox3d. Default: (1, 0, 0). - rot_axis (int, optional): rotation axis of bbox. 
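For the box-drawing helpers just added, the sketch below projects a single hypothetical LiDARInstance3DBoxes instance onto a blank image; the box, image size and projection matrix are invented for illustration. Note that plot_rect3d_on_img casts corners with np.int, an alias that recent NumPy releases have removed, so this assumes the older NumPy pinned by mmdet3d of this generation.

import numpy as np

from mmdet3d.core.bbox import LiDARInstance3DBoxes
from mmdet3d.core.visualizer.image_vis import draw_lidar_bbox3d_on_img

img = np.zeros((375, 1242, 3), dtype=np.uint8)

# One made-up box: (x, y, z, x_size, y_size, z_size, yaw) in LiDAR coordinates.
boxes = LiDARInstance3DBoxes(
    np.array([[20.0, 1.0, -1.5, 3.9, 1.6, 1.5, 0.3]], dtype=np.float32))

# Same synthetic intrinsics/extrinsics pattern as for point projection.
cam2img = np.array([[720., 0., 620., 0.],
                    [0., 720., 190., 0.],
                    [0., 0., 1., 0.],
                    [0., 0., 0., 1.]])
lidar2cam = np.array([[0., -1., 0., 0.],
                      [0., 0., -1., 1.6],
                      [1., 0., 0., 0.],
                      [0., 0., 0., 1.]])

# img_metas is documented as unused for the LiDAR variant, so None suffices.
drawn = draw_lidar_bbox3d_on_img(boxes, img, cam2img @ lidar2cam, None)
print(drawn.shape, drawn.dtype)   # (375, 1242, 3) uint8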
Default: 2. - center_mode (bool, optional): indicate the center of bbox is - bottom center or gravity center. available mode - ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str, optional): indicate type of the input points, - available mode ['xyz', 'xyzrgb']. Default: 'xyz'. - """ - if isinstance(bbox3d, torch.Tensor): - bbox3d = bbox3d.cpu().numpy() - bbox3d = bbox3d.copy() - - in_box_color = np.array(points_in_box_color) - for i in range(len(bbox3d)): - center = bbox3d[i, 0:3] - dim = bbox3d[i, 3:6] - yaw = np.zeros(3) - yaw[rot_axis] = bbox3d[i, 6] - rot_mat = geometry.get_rotation_matrix_from_xyz(yaw) - - if center_mode == 'lidar_bottom': - center[rot_axis] += dim[ - rot_axis] / 2 # bottom center to gravity center - elif center_mode == 'camera_bottom': - center[rot_axis] -= dim[ - rot_axis] / 2 # bottom center to gravity center - box3d = geometry.OrientedBoundingBox(center, rot_mat, dim) - - line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d) - line_set.paint_uniform_color(bbox_color) - # draw bboxes on visualizer - vis.add_geometry(line_set) - - # change the color of points which are in box - if pcd is not None and mode == 'xyz': - indices = box3d.get_point_indices_within_bounding_box(pcd.points) - points_colors[indices] = in_box_color - - # update points colors - if pcd is not None: - pcd.colors = o3d.utility.Vector3dVector(points_colors) - vis.update_geometry(pcd) - - -def show_pts_boxes(points, - bbox3d=None, - show=True, - save_path=None, - points_size=2, - point_color=(0.5, 0.5, 0.5), - bbox_color=(0, 1, 0), - points_in_box_color=(1, 0, 0), - rot_axis=2, - center_mode='lidar_bottom', - mode='xyz'): - """Draw bbox and points on visualizer. - - Args: - points (numpy.array | torch.tensor, shape=[N, 3+C]): - points to visualize. - bbox3d (numpy.array | torch.tensor, shape=[M, 7], optional): - 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. - Defaults to None. - show (bool, optional): whether to show the visualization results. - Default: True. - save_path (str, optional): path to save visualized results. - Default: None. - points_size (int, optional): the size of points to show on visualizer. - Default: 2. - point_color (tuple[float], optional): the color of points. - Default: (0.5, 0.5, 0.5). - bbox_color (tuple[float], optional): the color of bbox. - Default: (0, 1, 0). - points_in_box_color (tuple[float], optional): - the color of points which are in bbox3d. Default: (1, 0, 0). - rot_axis (int, optional): rotation axis of bbox. Default: 2. - center_mode (bool, optional): indicate the center of bbox is bottom - center or gravity center. available mode - ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str, optional): indicate type of the input points, available - mode ['xyz', 'xyzrgb']. Default: 'xyz'. 
- """ - # TODO: support score and class info - assert 0 <= rot_axis <= 2 - - # init visualizer - vis = o3d.visualization.Visualizer() - vis.create_window() - mesh_frame = geometry.TriangleMesh.create_coordinate_frame( - size=1, origin=[0, 0, 0]) # create coordinate frame - vis.add_geometry(mesh_frame) - - # draw points - pcd, points_colors = _draw_points(points, vis, points_size, point_color, - mode) - - # draw boxes - if bbox3d is not None: - _draw_bboxes(bbox3d, vis, points_colors, pcd, bbox_color, - points_in_box_color, rot_axis, center_mode, mode) - - if show: - vis.run() - - if save_path is not None: - vis.capture_screen_image(save_path) - - vis.destroy_window() - - -def _draw_bboxes_ind(bbox3d, - vis, - indices, - points_colors, - pcd=None, - bbox_color=(0, 1, 0), - points_in_box_color=(1, 0, 0), - rot_axis=2, - center_mode='lidar_bottom', - mode='xyz'): - """Draw bbox on visualizer and change the color or points inside bbox3d - with indices. - - Args: - bbox3d (numpy.array | torch.tensor, shape=[M, 7]): - 3d bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. - vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. - indices (numpy.array | torch.tensor, shape=[N, M]): - indicate which bbox3d that each point lies in. - points_colors (numpy.array): color of each points. - pcd (:obj:`open3d.geometry.PointCloud`, optional): point cloud. - Default: None. - bbox_color (tuple[float], optional): the color of bbox. - Default: (0, 1, 0). - points_in_box_color (tuple[float], optional): - the color of points which are in bbox3d. Default: (1, 0, 0). - rot_axis (int, optional): rotation axis of bbox. Default: 2. - center_mode (bool, optional): indicate the center of bbox is - bottom center or gravity center. available mode - ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str, optional): indicate type of the input points, - available mode ['xyz', 'xyzrgb']. Default: 'xyz'. 
- """ - if isinstance(bbox3d, torch.Tensor): - bbox3d = bbox3d.cpu().numpy() - if isinstance(indices, torch.Tensor): - indices = indices.cpu().numpy() - bbox3d = bbox3d.copy() - - in_box_color = np.array(points_in_box_color) - for i in range(len(bbox3d)): - center = bbox3d[i, 0:3] - dim = bbox3d[i, 3:6] - yaw = np.zeros(3) - # TODO: fix problem of current coordinate system - # dim[0], dim[1] = dim[1], dim[0] # for current coordinate - # yaw[rot_axis] = -(bbox3d[i, 6] - 0.5 * np.pi) - yaw[rot_axis] = -bbox3d[i, 6] - rot_mat = geometry.get_rotation_matrix_from_xyz(yaw) - if center_mode == 'lidar_bottom': - center[rot_axis] += dim[ - rot_axis] / 2 # bottom center to gravity center - elif center_mode == 'camera_bottom': - center[rot_axis] -= dim[ - rot_axis] / 2 # bottom center to gravity center - box3d = geometry.OrientedBoundingBox(center, rot_mat, dim) - - line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d) - line_set.paint_uniform_color(bbox_color) - # draw bboxes on visualizer - vis.add_geometry(line_set) - - # change the color of points which are in box - if pcd is not None and mode == 'xyz': - points_colors[indices[:, i].astype(np.bool)] = in_box_color - - # update points colors - if pcd is not None: - pcd.colors = o3d.utility.Vector3dVector(points_colors) - vis.update_geometry(pcd) - - -def show_pts_index_boxes(points, - bbox3d=None, - show=True, - indices=None, - save_path=None, - points_size=2, - point_color=(0.5, 0.5, 0.5), - bbox_color=(0, 1, 0), - points_in_box_color=(1, 0, 0), - rot_axis=2, - center_mode='lidar_bottom', - mode='xyz'): - """Draw bbox and points on visualizer with indices that indicate which - bbox3d that each point lies in. - - Args: - points (numpy.array | torch.tensor, shape=[N, 3+C]): - points to visualize. - bbox3d (numpy.array | torch.tensor, shape=[M, 7]): - 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. - Defaults to None. - show (bool, optional): whether to show the visualization results. - Default: True. - indices (numpy.array | torch.tensor, shape=[N, M], optional): - indicate which bbox3d that each point lies in. Default: None. - save_path (str, optional): path to save visualized results. - Default: None. - points_size (int, optional): the size of points to show on visualizer. - Default: 2. - point_color (tuple[float], optional): the color of points. - Default: (0.5, 0.5, 0.5). - bbox_color (tuple[float], optional): the color of bbox. - Default: (0, 1, 0). - points_in_box_color (tuple[float], optional): - the color of points which are in bbox3d. Default: (1, 0, 0). - rot_axis (int, optional): rotation axis of bbox. Default: 2. - center_mode (bool, optional): indicate the center of bbox is - bottom center or gravity center. available mode - ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str, optional): indicate type of the input points, - available mode ['xyz', 'xyzrgb']. Default: 'xyz'. 
- """ - # TODO: support score and class info - assert 0 <= rot_axis <= 2 - - # init visualizer - vis = o3d.visualization.Visualizer() - vis.create_window() - mesh_frame = geometry.TriangleMesh.create_coordinate_frame( - size=1, origin=[0, 0, 0]) # create coordinate frame - vis.add_geometry(mesh_frame) - - # draw points - pcd, points_colors = _draw_points(points, vis, points_size, point_color, - mode) - - # draw boxes - if bbox3d is not None: - _draw_bboxes_ind(bbox3d, vis, indices, points_colors, pcd, bbox_color, - points_in_box_color, rot_axis, center_mode, mode) - - if show: - vis.run() - - if save_path is not None: - vis.capture_screen_image(save_path) - - vis.destroy_window() - - -class Visualizer(object): - r"""Online visualizer implemented with Open3d. - - Args: - points (numpy.array, shape=[N, 3+C]): Points to visualize. The Points - cloud is in mode of Coord3DMode.DEPTH (please refer to - core.structures.coord_3d_mode). - bbox3d (numpy.array, shape=[M, 7], optional): 3D bbox - (x, y, z, x_size, y_size, z_size, yaw) to visualize. - The 3D bbox is in mode of Box3DMode.DEPTH with - gravity_center (please refer to core.structures.box_3d_mode). - Default: None. - save_path (str, optional): path to save visualized results. - Default: None. - points_size (int, optional): the size of points to show on visualizer. - Default: 2. - point_color (tuple[float], optional): the color of points. - Default: (0.5, 0.5, 0.5). - bbox_color (tuple[float], optional): the color of bbox. - Default: (0, 1, 0). - points_in_box_color (tuple[float], optional): - the color of points which are in bbox3d. Default: (1, 0, 0). - rot_axis (int, optional): rotation axis of bbox. Default: 2. - center_mode (bool, optional): indicate the center of bbox is - bottom center or gravity center. available mode - ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. - mode (str, optional): indicate type of the input points, - available mode ['xyz', 'xyzrgb']. Default: 'xyz'. - """ - - def __init__(self, - points, - bbox3d=None, - save_path=None, - points_size=2, - point_color=(0.5, 0.5, 0.5), - bbox_color=(0, 1, 0), - points_in_box_color=(1, 0, 0), - rot_axis=2, - center_mode='lidar_bottom', - mode='xyz'): - super(Visualizer, self).__init__() - assert 0 <= rot_axis <= 2 - - # init visualizer - self.o3d_visualizer = o3d.visualization.Visualizer() - self.o3d_visualizer.create_window() - mesh_frame = geometry.TriangleMesh.create_coordinate_frame( - size=1, origin=[0, 0, 0]) # create coordinate frame - self.o3d_visualizer.add_geometry(mesh_frame) - - self.points_size = points_size - self.point_color = point_color - self.bbox_color = bbox_color - self.points_in_box_color = points_in_box_color - self.rot_axis = rot_axis - self.center_mode = center_mode - self.mode = mode - self.seg_num = 0 - - # draw points - if points is not None: - self.pcd, self.points_colors = _draw_points( - points, self.o3d_visualizer, points_size, point_color, mode) - - # draw boxes - if bbox3d is not None: - _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, - self.pcd, bbox_color, points_in_box_color, rot_axis, - center_mode, mode) - - def add_bboxes(self, bbox3d, bbox_color=None, points_in_box_color=None): - """Add bounding box to visualizer. - - Args: - bbox3d (numpy.array, shape=[M, 7]): - 3D bbox (x, y, z, x_size, y_size, z_size, yaw) - to be visualized. The 3d bbox is in mode of - Box3DMode.DEPTH with gravity_center (please refer to - core.structures.box_3d_mode). - bbox_color (tuple[float]): the color of bbox. Default: None. 
- points_in_box_color (tuple[float]): the color of points which - are in bbox3d. Default: None. - """ - if bbox_color is None: - bbox_color = self.bbox_color - if points_in_box_color is None: - points_in_box_color = self.points_in_box_color - _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, self.pcd, - bbox_color, points_in_box_color, self.rot_axis, - self.center_mode, self.mode) - - def add_seg_mask(self, seg_mask_colors): - """Add segmentation mask to visualizer via per-point colorization. - - Args: - seg_mask_colors (numpy.array, shape=[N, 6]): - The segmentation mask whose first 3 dims are point coordinates - and last 3 dims are converted colors. - """ - # we can't draw the colors on existing points - # in case gt and pred mask would overlap - # instead we set a large offset along x-axis for each seg mask - self.seg_num += 1 - offset = (np.array(self.pcd.points).max(0) - - np.array(self.pcd.points).min(0))[0] * 1.2 * self.seg_num - mesh_frame = geometry.TriangleMesh.create_coordinate_frame( - size=1, origin=[offset, 0, 0]) # create coordinate frame for seg - self.o3d_visualizer.add_geometry(mesh_frame) - seg_points = copy.deepcopy(seg_mask_colors) - seg_points[:, 0] += offset - _draw_points( - seg_points, self.o3d_visualizer, self.points_size, mode='xyzrgb') - - def show(self, save_path=None): - """Visualize the points cloud. - - Args: - save_path (str, optional): path to save image. Default: None. - """ - - self.o3d_visualizer.run() - - if save_path is not None: - self.o3d_visualizer.capture_screen_image(save_path) - - self.o3d_visualizer.destroy_window() - return +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import numpy as np +import torch + +try: + import open3d as o3d + from open3d import geometry +except ImportError: + raise ImportError( + 'Please run "pip install open3d" to install open3d first.') + + +def _draw_points(points, + vis, + points_size=2, + point_color=(0.5, 0.5, 0.5), + mode='xyz'): + """Draw points on visualizer. + + Args: + points (numpy.array | torch.tensor, shape=[N, 3+C]): + points to visualize. + vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. + points_size (int, optional): the size of points to show on visualizer. + Default: 2. + point_color (tuple[float], optional): the color of points. + Default: (0.5, 0.5, 0.5). + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. + + Returns: + tuple: points, color of each point. + """ + vis.get_render_option().point_size = points_size # set points size + if isinstance(points, torch.Tensor): + points = points.cpu().numpy() + + points = points.copy() + pcd = geometry.PointCloud() + if mode == 'xyz': + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + points_colors = np.tile(np.array(point_color), (points.shape[0], 1)) + elif mode == 'xyzrgb': + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + points_colors = points[:, 3:6] + # normalize to [0, 1] for open3d drawing + if not ((points_colors >= 0.0) & (points_colors <= 1.0)).all(): + points_colors /= 255.0 + else: + raise NotImplementedError + + pcd.colors = o3d.utility.Vector3dVector(points_colors) + vis.add_geometry(pcd) + + return pcd, points_colors + + +def _draw_bboxes(bbox3d, + vis, + points_colors, + pcd=None, + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + """Draw bbox on visualizer and change the color of points inside bbox3d. 
+ + Args: + bbox3d (numpy.array | torch.tensor, shape=[M, 7]): + 3d bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. + vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. + points_colors (numpy.array): color of each points. + pcd (:obj:`open3d.geometry.PointCloud`, optional): point cloud. + Default: None. + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): + the color of points inside bbox3d. Default: (1, 0, 0). + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is + bottom center or gravity center. available mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. + """ + if isinstance(bbox3d, torch.Tensor): + bbox3d = bbox3d.cpu().numpy() + bbox3d = bbox3d.copy() + + in_box_color = np.array(points_in_box_color) + for i in range(len(bbox3d)): + center = bbox3d[i, 0:3] + dim = bbox3d[i, 3:6] + yaw = np.zeros(3) + yaw[rot_axis] = bbox3d[i, 6] + rot_mat = geometry.get_rotation_matrix_from_xyz(yaw) + + if center_mode == 'lidar_bottom': + center[rot_axis] += dim[ + rot_axis] / 2 # bottom center to gravity center + elif center_mode == 'camera_bottom': + center[rot_axis] -= dim[ + rot_axis] / 2 # bottom center to gravity center + box3d = geometry.OrientedBoundingBox(center, rot_mat, dim) + + line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d) + line_set.paint_uniform_color(bbox_color) + # draw bboxes on visualizer + vis.add_geometry(line_set) + + # change the color of points which are in box + if pcd is not None and mode == 'xyz': + indices = box3d.get_point_indices_within_bounding_box(pcd.points) + points_colors[indices] = in_box_color + + # update points colors + if pcd is not None: + pcd.colors = o3d.utility.Vector3dVector(points_colors) + vis.update_geometry(pcd) + + +def show_pts_boxes(points, + bbox3d=None, + show=True, + save_path=None, + points_size=2, + point_color=(0.5, 0.5, 0.5), + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + """Draw bbox and points on visualizer. + + Args: + points (numpy.array | torch.tensor, shape=[N, 3+C]): + points to visualize. + bbox3d (numpy.array | torch.tensor, shape=[M, 7], optional): + 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. + Defaults to None. + show (bool, optional): whether to show the visualization results. + Default: True. + save_path (str, optional): path to save visualized results. + Default: None. + points_size (int, optional): the size of points to show on visualizer. + Default: 2. + point_color (tuple[float], optional): the color of points. + Default: (0.5, 0.5, 0.5). + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): + the color of points which are in bbox3d. Default: (1, 0, 0). + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is bottom + center or gravity center. available mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str, optional): indicate type of the input points, available + mode ['xyz', 'xyzrgb']. Default: 'xyz'. 
+ """ + # TODO: support score and class info + assert 0 <= rot_axis <= 2 + + # init visualizer + vis = o3d.visualization.Visualizer() + vis.create_window() + mesh_frame = geometry.TriangleMesh.create_coordinate_frame( + size=1, origin=[0, 0, 0]) # create coordinate frame + vis.add_geometry(mesh_frame) + + # draw points + pcd, points_colors = _draw_points(points, vis, points_size, point_color, + mode) + + # draw boxes + if bbox3d is not None: + _draw_bboxes(bbox3d, vis, points_colors, pcd, bbox_color, + points_in_box_color, rot_axis, center_mode, mode) + + if show: + vis.run() + + if save_path is not None: + vis.capture_screen_image(save_path) + + vis.destroy_window() + + +def _draw_bboxes_ind(bbox3d, + vis, + indices, + points_colors, + pcd=None, + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + """Draw bbox on visualizer and change the color or points inside bbox3d + with indices. + + Args: + bbox3d (numpy.array | torch.tensor, shape=[M, 7]): + 3d bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. + vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. + indices (numpy.array | torch.tensor, shape=[N, M]): + indicate which bbox3d that each point lies in. + points_colors (numpy.array): color of each points. + pcd (:obj:`open3d.geometry.PointCloud`, optional): point cloud. + Default: None. + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): + the color of points which are in bbox3d. Default: (1, 0, 0). + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is + bottom center or gravity center. available mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. 
+ """ + if isinstance(bbox3d, torch.Tensor): + bbox3d = bbox3d.cpu().numpy() + if isinstance(indices, torch.Tensor): + indices = indices.cpu().numpy() + bbox3d = bbox3d.copy() + + in_box_color = np.array(points_in_box_color) + for i in range(len(bbox3d)): + center = bbox3d[i, 0:3] + dim = bbox3d[i, 3:6] + yaw = np.zeros(3) + # TODO: fix problem of current coordinate system + # dim[0], dim[1] = dim[1], dim[0] # for current coordinate + # yaw[rot_axis] = -(bbox3d[i, 6] - 0.5 * np.pi) + yaw[rot_axis] = -bbox3d[i, 6] + rot_mat = geometry.get_rotation_matrix_from_xyz(yaw) + if center_mode == 'lidar_bottom': + center[rot_axis] += dim[ + rot_axis] / 2 # bottom center to gravity center + elif center_mode == 'camera_bottom': + center[rot_axis] -= dim[ + rot_axis] / 2 # bottom center to gravity center + box3d = geometry.OrientedBoundingBox(center, rot_mat, dim) + + line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d) + line_set.paint_uniform_color(bbox_color) + # draw bboxes on visualizer + vis.add_geometry(line_set) + + # change the color of points which are in box + if pcd is not None and mode == 'xyz': + points_colors[indices[:, i].astype(np.bool)] = in_box_color + + # update points colors + if pcd is not None: + pcd.colors = o3d.utility.Vector3dVector(points_colors) + vis.update_geometry(pcd) + + +def show_pts_index_boxes(points, + bbox3d=None, + show=True, + indices=None, + save_path=None, + points_size=2, + point_color=(0.5, 0.5, 0.5), + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + """Draw bbox and points on visualizer with indices that indicate which + bbox3d that each point lies in. + + Args: + points (numpy.array | torch.tensor, shape=[N, 3+C]): + points to visualize. + bbox3d (numpy.array | torch.tensor, shape=[M, 7]): + 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. + Defaults to None. + show (bool, optional): whether to show the visualization results. + Default: True. + indices (numpy.array | torch.tensor, shape=[N, M], optional): + indicate which bbox3d that each point lies in. Default: None. + save_path (str, optional): path to save visualized results. + Default: None. + points_size (int, optional): the size of points to show on visualizer. + Default: 2. + point_color (tuple[float], optional): the color of points. + Default: (0.5, 0.5, 0.5). + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): + the color of points which are in bbox3d. Default: (1, 0, 0). + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is + bottom center or gravity center. available mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. 
+ """ + # TODO: support score and class info + assert 0 <= rot_axis <= 2 + + # init visualizer + vis = o3d.visualization.Visualizer() + vis.create_window() + mesh_frame = geometry.TriangleMesh.create_coordinate_frame( + size=1, origin=[0, 0, 0]) # create coordinate frame + vis.add_geometry(mesh_frame) + + # draw points + pcd, points_colors = _draw_points(points, vis, points_size, point_color, + mode) + + # draw boxes + if bbox3d is not None: + _draw_bboxes_ind(bbox3d, vis, indices, points_colors, pcd, bbox_color, + points_in_box_color, rot_axis, center_mode, mode) + + if show: + vis.run() + + if save_path is not None: + vis.capture_screen_image(save_path) + + vis.destroy_window() + + +class Visualizer(object): + r"""Online visualizer implemented with Open3d. + + Args: + points (numpy.array, shape=[N, 3+C]): Points to visualize. The Points + cloud is in mode of Coord3DMode.DEPTH (please refer to + core.structures.coord_3d_mode). + bbox3d (numpy.array, shape=[M, 7], optional): 3D bbox + (x, y, z, x_size, y_size, z_size, yaw) to visualize. + The 3D bbox is in mode of Box3DMode.DEPTH with + gravity_center (please refer to core.structures.box_3d_mode). + Default: None. + save_path (str, optional): path to save visualized results. + Default: None. + points_size (int, optional): the size of points to show on visualizer. + Default: 2. + point_color (tuple[float], optional): the color of points. + Default: (0.5, 0.5, 0.5). + bbox_color (tuple[float], optional): the color of bbox. + Default: (0, 1, 0). + points_in_box_color (tuple[float], optional): + the color of points which are in bbox3d. Default: (1, 0, 0). + rot_axis (int, optional): rotation axis of bbox. Default: 2. + center_mode (bool, optional): indicate the center of bbox is + bottom center or gravity center. available mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str, optional): indicate type of the input points, + available mode ['xyz', 'xyzrgb']. Default: 'xyz'. + """ + + def __init__(self, + points, + bbox3d=None, + save_path=None, + points_size=2, + point_color=(0.5, 0.5, 0.5), + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + super(Visualizer, self).__init__() + assert 0 <= rot_axis <= 2 + + # init visualizer + self.o3d_visualizer = o3d.visualization.Visualizer() + self.o3d_visualizer.create_window() + mesh_frame = geometry.TriangleMesh.create_coordinate_frame( + size=1, origin=[0, 0, 0]) # create coordinate frame + self.o3d_visualizer.add_geometry(mesh_frame) + + self.points_size = points_size + self.point_color = point_color + self.bbox_color = bbox_color + self.points_in_box_color = points_in_box_color + self.rot_axis = rot_axis + self.center_mode = center_mode + self.mode = mode + self.seg_num = 0 + + # draw points + if points is not None: + self.pcd, self.points_colors = _draw_points( + points, self.o3d_visualizer, points_size, point_color, mode) + + # draw boxes + if bbox3d is not None: + _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, + self.pcd, bbox_color, points_in_box_color, rot_axis, + center_mode, mode) + + def add_bboxes(self, bbox3d, bbox_color=None, points_in_box_color=None): + """Add bounding box to visualizer. + + Args: + bbox3d (numpy.array, shape=[M, 7]): + 3D bbox (x, y, z, x_size, y_size, z_size, yaw) + to be visualized. The 3d bbox is in mode of + Box3DMode.DEPTH with gravity_center (please refer to + core.structures.box_3d_mode). + bbox_color (tuple[float]): the color of bbox. Default: None. 
+ points_in_box_color (tuple[float]): the color of points which + are in bbox3d. Default: None. + """ + if bbox_color is None: + bbox_color = self.bbox_color + if points_in_box_color is None: + points_in_box_color = self.points_in_box_color + _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, self.pcd, + bbox_color, points_in_box_color, self.rot_axis, + self.center_mode, self.mode) + + def add_seg_mask(self, seg_mask_colors): + """Add segmentation mask to visualizer via per-point colorization. + + Args: + seg_mask_colors (numpy.array, shape=[N, 6]): + The segmentation mask whose first 3 dims are point coordinates + and last 3 dims are converted colors. + """ + # we can't draw the colors on existing points + # in case gt and pred mask would overlap + # instead we set a large offset along x-axis for each seg mask + self.seg_num += 1 + offset = (np.array(self.pcd.points).max(0) - + np.array(self.pcd.points).min(0))[0] * 1.2 * self.seg_num + mesh_frame = geometry.TriangleMesh.create_coordinate_frame( + size=1, origin=[offset, 0, 0]) # create coordinate frame for seg + self.o3d_visualizer.add_geometry(mesh_frame) + seg_points = copy.deepcopy(seg_mask_colors) + seg_points[:, 0] += offset + _draw_points( + seg_points, self.o3d_visualizer, self.points_size, mode='xyzrgb') + + def show(self, save_path=None): + """Visualize the points cloud. + + Args: + save_path (str, optional): path to save image. Default: None. + """ + + self.o3d_visualizer.run() + + if save_path is not None: + self.o3d_visualizer.capture_screen_image(save_path) + + self.o3d_visualizer.destroy_window() + return diff --git a/mmdet3d/core/visualizer/show_result.py b/mmdet3d/core/visualizer/show_result.py index aa732cf..f6614b5 100644 --- a/mmdet3d/core/visualizer/show_result.py +++ b/mmdet3d/core/visualizer/show_result.py @@ -1,291 +1,291 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from os import path as osp - -import mmcv -import numpy as np -import trimesh - -from .image_vis import (draw_camera_bbox3d_on_img, draw_depth_bbox3d_on_img, - draw_lidar_bbox3d_on_img) - - -def _write_obj(points, out_filename): - """Write points into ``obj`` format for meshlab visualization. - - Args: - points (np.ndarray): Points in shape (N, dim). - out_filename (str): Filename to be saved. - """ - N = points.shape[0] - fout = open(out_filename, 'w') - for i in range(N): - if points.shape[1] == 6: - c = points[i, 3:].astype(int) - fout.write( - 'v %f %f %f %d %d %d\n' % - (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) - - else: - fout.write('v %f %f %f\n' % - (points[i, 0], points[i, 1], points[i, 2])) - fout.close() - - -def _write_oriented_bbox(scene_bbox, out_filename): - """Export oriented (around Z axis) scene bbox to meshes. - - Args: - scene_bbox(list[ndarray] or ndarray): xyz pos of center and - 3 lengths (x_size, y_size, z_size) and heading angle around Z axis. - Y forward, X right, Z upward. heading angle of positive X is 0, - heading angle of positive Y is 90 degrees. - out_filename(str): Filename. 
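The Visualizer class collects the same primitives behind a stateful interface, which is what the show_result path uses for online display. A small usage sketch with fabricated ground-truth and predicted boxes (all values illustrative):

import numpy as np

from mmdet3d.core.visualizer.open3d_vis import Visualizer

# Depth-mode points plus one fabricated GT/prediction pair (x, y, z, dx, dy, dz, yaw).
points = np.random.rand(5000, 3).astype(np.float32) * 20.0
gt = np.array([[10.0, 10.0, 0.8, 4.0, 2.0, 1.5, 0.3]])
pred = np.array([[10.4, 9.7, 0.8, 3.8, 1.9, 1.4, 0.25]])

vis = Visualizer(points)
vis.add_bboxes(bbox3d=pred)                       # default green boxes
vis.add_bboxes(bbox3d=gt, bbox_color=(0, 0, 1))   # ground truth in blue
vis.show()                                        # blocks until the window is closed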
- """ - - def heading2rotmat(heading_angle): - rotmat = np.zeros((3, 3)) - rotmat[2, 2] = 1 - cosval = np.cos(heading_angle) - sinval = np.sin(heading_angle) - rotmat[0:2, 0:2] = np.array([[cosval, -sinval], [sinval, cosval]]) - return rotmat - - def convert_oriented_box_to_trimesh_fmt(box): - ctr = box[:3] - lengths = box[3:6] - trns = np.eye(4) - trns[0:3, 3] = ctr - trns[3, 3] = 1.0 - trns[0:3, 0:3] = heading2rotmat(box[6]) - box_trimesh_fmt = trimesh.creation.box(lengths, trns) - return box_trimesh_fmt - - if len(scene_bbox) == 0: - scene_bbox = np.zeros((1, 7)) - scene = trimesh.scene.Scene() - for box in scene_bbox: - scene.add_geometry(convert_oriented_box_to_trimesh_fmt(box)) - - mesh_list = trimesh.util.concatenate(scene.dump()) - # save to obj file - trimesh.io.export.export_mesh(mesh_list, out_filename, file_type='obj') - - return - - -def show_result(points, - gt_bboxes, - pred_bboxes, - out_dir, - filename, - show=False, - snapshot=False, - pred_labels=None): - """Convert results into format that is directly readable for meshlab. - - Args: - points (np.ndarray): Points. - gt_bboxes (np.ndarray): Ground truth boxes. - pred_bboxes (np.ndarray): Predicted boxes. - out_dir (str): Path of output directory - filename (str): Filename of the current frame. - show (bool, optional): Visualize the results online. Defaults to False. - snapshot (bool, optional): Whether to save the online results. - Defaults to False. - pred_labels (np.ndarray, optional): Predicted labels of boxes. - Defaults to None. - """ - result_path = osp.join(out_dir, filename) - mmcv.mkdir_or_exist(result_path) - - if show: - from .open3d_vis import Visualizer - - vis = Visualizer(points) - if pred_bboxes is not None: - if pred_labels is None: - vis.add_bboxes(bbox3d=pred_bboxes) - else: - palette = np.random.randint( - 0, 255, size=(pred_labels.max() + 1, 3)) / 256 - labelDict = {} - for j in range(len(pred_labels)): - i = int(pred_labels[j].numpy()) - if labelDict.get(i) is None: - labelDict[i] = [] - labelDict[i].append(pred_bboxes[j]) - for i in labelDict: - vis.add_bboxes( - bbox3d=np.array(labelDict[i]), - bbox_color=palette[i], - points_in_box_color=palette[i]) - - if gt_bboxes is not None: - vis.add_bboxes(bbox3d=gt_bboxes, bbox_color=(0, 0, 1)) - show_path = osp.join(result_path, - f'{filename}_online.png') if snapshot else None - vis.show(show_path) - - if points is not None: - _write_obj(points, osp.join(result_path, f'{filename}_points.obj')) - - if gt_bboxes is not None: - # bottom center to gravity center - gt_bboxes[..., 2] += gt_bboxes[..., 5] / 2 - - _write_oriented_bbox(gt_bboxes, - osp.join(result_path, f'{filename}_gt.obj')) - - if pred_bboxes is not None: - # bottom center to gravity center - pred_bboxes[..., 2] += pred_bboxes[..., 5] / 2 - - _write_oriented_bbox(pred_bboxes, - osp.join(result_path, f'{filename}_pred.obj')) - - -def show_seg_result(points, - gt_seg, - pred_seg, - out_dir, - filename, - palette, - ignore_index=None, - show=False, - snapshot=False): - """Convert results into format that is directly readable for meshlab. - - Args: - points (np.ndarray): Points. - gt_seg (np.ndarray): Ground truth segmentation mask. - pred_seg (np.ndarray): Predicted segmentation mask. - out_dir (str): Path of output directory - filename (str): Filename of the current frame. - palette (np.ndarray): Mapping between class labels and colors. - ignore_index (int, optional): The label index to be ignored, e.g. - unannotated points. Defaults to None. - show (bool, optional): Visualize the results online. 
Defaults to False. - snapshot (bool, optional): Whether to save the online results. - Defaults to False. - """ - # we need 3D coordinates to visualize segmentation mask - if gt_seg is not None or pred_seg is not None: - assert points is not None, \ - '3D coordinates are required for segmentation visualization' - - # filter out ignored points - if gt_seg is not None and ignore_index is not None: - if points is not None: - points = points[gt_seg != ignore_index] - if pred_seg is not None: - pred_seg = pred_seg[gt_seg != ignore_index] - gt_seg = gt_seg[gt_seg != ignore_index] - - if gt_seg is not None: - gt_seg_color = palette[gt_seg] - gt_seg_color = np.concatenate([points[:, :3], gt_seg_color], axis=1) - if pred_seg is not None: - pred_seg_color = palette[pred_seg] - pred_seg_color = np.concatenate([points[:, :3], pred_seg_color], - axis=1) - - result_path = osp.join(out_dir, filename) - mmcv.mkdir_or_exist(result_path) - - # online visualization of segmentation mask - # we show three masks in a row, scene_points, gt_mask, pred_mask - if show: - from .open3d_vis import Visualizer - mode = 'xyzrgb' if points.shape[1] == 6 else 'xyz' - vis = Visualizer(points, mode=mode) - if gt_seg is not None: - vis.add_seg_mask(gt_seg_color) - if pred_seg is not None: - vis.add_seg_mask(pred_seg_color) - show_path = osp.join(result_path, - f'{filename}_online.png') if snapshot else None - vis.show(show_path) - - if points is not None: - _write_obj(points, osp.join(result_path, f'{filename}_points.obj')) - - if gt_seg is not None: - _write_obj(gt_seg_color, osp.join(result_path, f'{filename}_gt.obj')) - - if pred_seg is not None: - _write_obj(pred_seg_color, osp.join(result_path, - f'{filename}_pred.obj')) - - -def show_multi_modality_result(img, - gt_bboxes, - pred_bboxes, - proj_mat, - out_dir, - filename, - box_mode='lidar', - img_metas=None, - show=False, - gt_bbox_color=(61, 102, 255), - pred_bbox_color=(241, 101, 72)): - """Convert multi-modality detection results into 2D results. - - Project the predicted 3D bbox to 2D image plane and visualize them. - - Args: - img (np.ndarray): The numpy array of image in cv2 fashion. - gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes. - pred_bboxes (:obj:`BaseInstance3DBoxes`): Predicted boxes. - proj_mat (numpy.array, shape=[4, 4]): The projection matrix - according to the camera intrinsic parameters. - out_dir (str): Path of output directory. - filename (str): Filename of the current frame. - box_mode (str, optional): Coordinate system the boxes are in. - Should be one of 'depth', 'lidar' and 'camera'. - Defaults to 'lidar'. - img_metas (dict, optional): Used in projecting depth bbox. - Defaults to None. - show (bool, optional): Visualize the results online. Defaults to False. - gt_bbox_color (str or tuple(int), optional): Color of bbox lines. - The tuple of color should be in BGR order. Default: (255, 102, 61). - pred_bbox_color (str or tuple(int), optional): Color of bbox lines. - The tuple of color should be in BGR order. Default: (72, 101, 241). 
- """ - if box_mode == 'depth': - draw_bbox = draw_depth_bbox3d_on_img - elif box_mode == 'lidar': - draw_bbox = draw_lidar_bbox3d_on_img - elif box_mode == 'camera': - draw_bbox = draw_camera_bbox3d_on_img - else: - raise NotImplementedError(f'unsupported box mode {box_mode}') - - result_path = osp.join(out_dir, filename) - mmcv.mkdir_or_exist(result_path) - - if show: - show_img = img.copy() - if gt_bboxes is not None: - show_img = draw_bbox( - gt_bboxes, show_img, proj_mat, img_metas, color=gt_bbox_color) - if pred_bboxes is not None: - show_img = draw_bbox( - pred_bboxes, - show_img, - proj_mat, - img_metas, - color=pred_bbox_color) - mmcv.imshow(show_img, win_name='project_bbox3d_img', wait_time=0) - - if img is not None: - mmcv.imwrite(img, osp.join(result_path, f'{filename}_img.png')) - - if gt_bboxes is not None: - gt_img = draw_bbox( - gt_bboxes, img, proj_mat, img_metas, color=gt_bbox_color) - mmcv.imwrite(gt_img, osp.join(result_path, f'{filename}_gt.png')) - - if pred_bboxes is not None: - pred_img = draw_bbox( - pred_bboxes, img, proj_mat, img_metas, color=pred_bbox_color) - mmcv.imwrite(pred_img, osp.join(result_path, f'{filename}_pred.png')) +# Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + +import mmcv +import numpy as np +import trimesh + +from .image_vis import (draw_camera_bbox3d_on_img, draw_depth_bbox3d_on_img, + draw_lidar_bbox3d_on_img) + + +def _write_obj(points, out_filename): + """Write points into ``obj`` format for meshlab visualization. + + Args: + points (np.ndarray): Points in shape (N, dim). + out_filename (str): Filename to be saved. + """ + N = points.shape[0] + fout = open(out_filename, 'w') + for i in range(N): + if points.shape[1] == 6: + c = points[i, 3:].astype(int) + fout.write( + 'v %f %f %f %d %d %d\n' % + (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) + + else: + fout.write('v %f %f %f\n' % + (points[i, 0], points[i, 1], points[i, 2])) + fout.close() + + +def _write_oriented_bbox(scene_bbox, out_filename): + """Export oriented (around Z axis) scene bbox to meshes. + + Args: + scene_bbox(list[ndarray] or ndarray): xyz pos of center and + 3 lengths (x_size, y_size, z_size) and heading angle around Z axis. + Y forward, X right, Z upward. heading angle of positive X is 0, + heading angle of positive Y is 90 degrees. + out_filename(str): Filename. + """ + + def heading2rotmat(heading_angle): + rotmat = np.zeros((3, 3)) + rotmat[2, 2] = 1 + cosval = np.cos(heading_angle) + sinval = np.sin(heading_angle) + rotmat[0:2, 0:2] = np.array([[cosval, -sinval], [sinval, cosval]]) + return rotmat + + def convert_oriented_box_to_trimesh_fmt(box): + ctr = box[:3] + lengths = box[3:6] + trns = np.eye(4) + trns[0:3, 3] = ctr + trns[3, 3] = 1.0 + trns[0:3, 0:3] = heading2rotmat(box[6]) + box_trimesh_fmt = trimesh.creation.box(lengths, trns) + return box_trimesh_fmt + + if len(scene_bbox) == 0: + scene_bbox = np.zeros((1, 7)) + scene = trimesh.scene.Scene() + for box in scene_bbox: + scene.add_geometry(convert_oriented_box_to_trimesh_fmt(box)) + + mesh_list = trimesh.util.concatenate(scene.dump()) + # save to obj file + trimesh.io.export.export_mesh(mesh_list, out_filename, file_type='obj') + + return + + +def show_result(points, + gt_bboxes, + pred_bboxes, + out_dir, + filename, + show=False, + snapshot=False, + pred_labels=None): + """Convert results into format that is directly readable for meshlab. + + Args: + points (np.ndarray): Points. + gt_bboxes (np.ndarray): Ground truth boxes. 
+ pred_bboxes (np.ndarray): Predicted boxes. + out_dir (str): Path of output directory + filename (str): Filename of the current frame. + show (bool, optional): Visualize the results online. Defaults to False. + snapshot (bool, optional): Whether to save the online results. + Defaults to False. + pred_labels (np.ndarray, optional): Predicted labels of boxes. + Defaults to None. + """ + result_path = osp.join(out_dir, filename) + mmcv.mkdir_or_exist(result_path) + + if show: + from .open3d_vis import Visualizer + + vis = Visualizer(points) + if pred_bboxes is not None: + if pred_labels is None: + vis.add_bboxes(bbox3d=pred_bboxes) + else: + palette = np.random.randint( + 0, 255, size=(pred_labels.max() + 1, 3)) / 256 + labelDict = {} + for j in range(len(pred_labels)): + i = int(pred_labels[j].numpy()) + if labelDict.get(i) is None: + labelDict[i] = [] + labelDict[i].append(pred_bboxes[j]) + for i in labelDict: + vis.add_bboxes( + bbox3d=np.array(labelDict[i]), + bbox_color=palette[i], + points_in_box_color=palette[i]) + + if gt_bboxes is not None: + vis.add_bboxes(bbox3d=gt_bboxes, bbox_color=(0, 0, 1)) + show_path = osp.join(result_path, + f'{filename}_online.png') if snapshot else None + vis.show(show_path) + + if points is not None: + _write_obj(points, osp.join(result_path, f'{filename}_points.obj')) + + if gt_bboxes is not None: + # bottom center to gravity center + gt_bboxes[..., 2] += gt_bboxes[..., 5] / 2 + + _write_oriented_bbox(gt_bboxes, + osp.join(result_path, f'{filename}_gt.obj')) + + if pred_bboxes is not None: + # bottom center to gravity center + pred_bboxes[..., 2] += pred_bboxes[..., 5] / 2 + + _write_oriented_bbox(pred_bboxes, + osp.join(result_path, f'{filename}_pred.obj')) + + +def show_seg_result(points, + gt_seg, + pred_seg, + out_dir, + filename, + palette, + ignore_index=None, + show=False, + snapshot=False): + """Convert results into format that is directly readable for meshlab. + + Args: + points (np.ndarray): Points. + gt_seg (np.ndarray): Ground truth segmentation mask. + pred_seg (np.ndarray): Predicted segmentation mask. + out_dir (str): Path of output directory + filename (str): Filename of the current frame. + palette (np.ndarray): Mapping between class labels and colors. + ignore_index (int, optional): The label index to be ignored, e.g. + unannotated points. Defaults to None. + show (bool, optional): Visualize the results online. Defaults to False. + snapshot (bool, optional): Whether to save the online results. + Defaults to False. 
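When called with show=False, show_result above is purely an exporter: it writes the points and both box sets as .obj meshes that MeshLab can open. A sketch with fabricated inputs follows; the paths and numbers are illustrative, and since the oriented-box export goes through trimesh.io, it assumes an older trimesh release that still ships that module.

import numpy as np

from mmdet3d.core.visualizer import show_result

points = np.random.rand(1000, 3).astype(np.float32) * 10.0
gt = np.array([[5.0, 5.0, 0.0, 2.0, 4.0, 1.5, 0.3]])     # bottom-centered boxes
pred = np.array([[5.1, 4.9, 0.0, 2.1, 3.9, 1.4, 0.28]])

# Writes ./vis/frame_000/frame_000_{points,gt,pred}.obj and opens no window.
show_result(points, gt, pred, out_dir='./vis', filename='frame_000')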
+ """ + # we need 3D coordinates to visualize segmentation mask + if gt_seg is not None or pred_seg is not None: + assert points is not None, \ + '3D coordinates are required for segmentation visualization' + + # filter out ignored points + if gt_seg is not None and ignore_index is not None: + if points is not None: + points = points[gt_seg != ignore_index] + if pred_seg is not None: + pred_seg = pred_seg[gt_seg != ignore_index] + gt_seg = gt_seg[gt_seg != ignore_index] + + if gt_seg is not None: + gt_seg_color = palette[gt_seg] + gt_seg_color = np.concatenate([points[:, :3], gt_seg_color], axis=1) + if pred_seg is not None: + pred_seg_color = palette[pred_seg] + pred_seg_color = np.concatenate([points[:, :3], pred_seg_color], + axis=1) + + result_path = osp.join(out_dir, filename) + mmcv.mkdir_or_exist(result_path) + + # online visualization of segmentation mask + # we show three masks in a row, scene_points, gt_mask, pred_mask + if show: + from .open3d_vis import Visualizer + mode = 'xyzrgb' if points.shape[1] == 6 else 'xyz' + vis = Visualizer(points, mode=mode) + if gt_seg is not None: + vis.add_seg_mask(gt_seg_color) + if pred_seg is not None: + vis.add_seg_mask(pred_seg_color) + show_path = osp.join(result_path, + f'{filename}_online.png') if snapshot else None + vis.show(show_path) + + if points is not None: + _write_obj(points, osp.join(result_path, f'{filename}_points.obj')) + + if gt_seg is not None: + _write_obj(gt_seg_color, osp.join(result_path, f'{filename}_gt.obj')) + + if pred_seg is not None: + _write_obj(pred_seg_color, osp.join(result_path, + f'{filename}_pred.obj')) + + +def show_multi_modality_result(img, + gt_bboxes, + pred_bboxes, + proj_mat, + out_dir, + filename, + box_mode='lidar', + img_metas=None, + show=False, + gt_bbox_color=(61, 102, 255), + pred_bbox_color=(241, 101, 72)): + """Convert multi-modality detection results into 2D results. + + Project the predicted 3D bbox to 2D image plane and visualize them. + + Args: + img (np.ndarray): The numpy array of image in cv2 fashion. + gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes. + pred_bboxes (:obj:`BaseInstance3DBoxes`): Predicted boxes. + proj_mat (numpy.array, shape=[4, 4]): The projection matrix + according to the camera intrinsic parameters. + out_dir (str): Path of output directory. + filename (str): Filename of the current frame. + box_mode (str, optional): Coordinate system the boxes are in. + Should be one of 'depth', 'lidar' and 'camera'. + Defaults to 'lidar'. + img_metas (dict, optional): Used in projecting depth bbox. + Defaults to None. + show (bool, optional): Visualize the results online. Defaults to False. + gt_bbox_color (str or tuple(int), optional): Color of bbox lines. + The tuple of color should be in BGR order. Default: (255, 102, 61). + pred_bbox_color (str or tuple(int), optional): Color of bbox lines. + The tuple of color should be in BGR order. Default: (72, 101, 241). 
+ """ + if box_mode == 'depth': + draw_bbox = draw_depth_bbox3d_on_img + elif box_mode == 'lidar': + draw_bbox = draw_lidar_bbox3d_on_img + elif box_mode == 'camera': + draw_bbox = draw_camera_bbox3d_on_img + else: + raise NotImplementedError(f'unsupported box mode {box_mode}') + + result_path = osp.join(out_dir, filename) + mmcv.mkdir_or_exist(result_path) + + if show: + show_img = img.copy() + if gt_bboxes is not None: + show_img = draw_bbox( + gt_bboxes, show_img, proj_mat, img_metas, color=gt_bbox_color) + if pred_bboxes is not None: + show_img = draw_bbox( + pred_bboxes, + show_img, + proj_mat, + img_metas, + color=pred_bbox_color) + mmcv.imshow(show_img, win_name='project_bbox3d_img', wait_time=0) + + if img is not None: + mmcv.imwrite(img, osp.join(result_path, f'{filename}_img.png')) + + if gt_bboxes is not None: + gt_img = draw_bbox( + gt_bboxes, img, proj_mat, img_metas, color=gt_bbox_color) + mmcv.imwrite(gt_img, osp.join(result_path, f'{filename}_gt.png')) + + if pred_bboxes is not None: + pred_img = draw_bbox( + pred_bboxes, img, proj_mat, img_metas, color=pred_bbox_color) + mmcv.imwrite(pred_img, osp.join(result_path, f'{filename}_pred.png')) diff --git a/mmdet3d/core/voxel/__init__.py b/mmdet3d/core/voxel/__init__.py index 8d69543..7aa3cb1 100644 --- a/mmdet3d/core/voxel/__init__.py +++ b/mmdet3d/core/voxel/__init__.py @@ -1,5 +1,5 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .builder import build_voxel_generator -from .voxel_generator import VoxelGenerator - -__all__ = ['build_voxel_generator', 'VoxelGenerator'] +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import build_voxel_generator +from .voxel_generator import VoxelGenerator + +__all__ = ['build_voxel_generator', 'VoxelGenerator'] diff --git a/mmdet3d/core/voxel/builder.py b/mmdet3d/core/voxel/builder.py index bc663ee..cec47eb 100644 --- a/mmdet3d/core/voxel/builder.py +++ b/mmdet3d/core/voxel/builder.py @@ -1,16 +1,16 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import mmcv - -from . import voxel_generator - - -def build_voxel_generator(cfg, **kwargs): - """Builder of voxel generator.""" - if isinstance(cfg, voxel_generator.VoxelGenerator): - return cfg - elif isinstance(cfg, dict): - return mmcv.runner.obj_from_dict( - cfg, voxel_generator, default_args=kwargs) - else: - raise TypeError('Invalid type {} for building a sampler'.format( - type(cfg))) +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv + +from . import voxel_generator + + +def build_voxel_generator(cfg, **kwargs): + """Builder of voxel generator.""" + if isinstance(cfg, voxel_generator.VoxelGenerator): + return cfg + elif isinstance(cfg, dict): + return mmcv.runner.obj_from_dict( + cfg, voxel_generator, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) diff --git a/mmdet3d/core/voxel/voxel_generator.py b/mmdet3d/core/voxel/voxel_generator.py index 404f2cd..dc42690 100644 --- a/mmdet3d/core/voxel/voxel_generator.py +++ b/mmdet3d/core/voxel/voxel_generator.py @@ -1,280 +1,280 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numba -import numpy as np - - -class VoxelGenerator(object): - """Voxel generator in numpy implementation. - - Args: - voxel_size (list[float]): Size of a single voxel - point_cloud_range (list[float]): Range of points - max_num_points (int): Maximum number of points in a single voxel - max_voxels (int, optional): Maximum number of voxels. - Defaults to 20000. 
- """ - - def __init__(self, - voxel_size, - point_cloud_range, - max_num_points, - max_voxels=20000): - - point_cloud_range = np.array(point_cloud_range, dtype=np.float32) - # [0, -40, -3, 70.4, 40, 1] - voxel_size = np.array(voxel_size, dtype=np.float32) - grid_size = (point_cloud_range[3:] - - point_cloud_range[:3]) / voxel_size - grid_size = np.round(grid_size).astype(np.int64) - - self._voxel_size = voxel_size - self._point_cloud_range = point_cloud_range - self._max_num_points = max_num_points - self._max_voxels = max_voxels - self._grid_size = grid_size - - def generate(self, points): - """Generate voxels given points.""" - return points_to_voxel(points, self._voxel_size, - self._point_cloud_range, self._max_num_points, - True, self._max_voxels) - - @property - def voxel_size(self): - """list[float]: Size of a single voxel.""" - return self._voxel_size - - @property - def max_num_points_per_voxel(self): - """int: Maximum number of points per voxel.""" - return self._max_num_points - - @property - def point_cloud_range(self): - """list[float]: Range of point cloud.""" - return self._point_cloud_range - - @property - def grid_size(self): - """np.ndarray: The size of grids.""" - return self._grid_size - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - indent = ' ' * (len(repr_str) + 1) - repr_str += f'(voxel_size={self._voxel_size},\n' - repr_str += indent + 'point_cloud_range=' - repr_str += f'{self._point_cloud_range.tolist()},\n' - repr_str += indent + f'max_num_points={self._max_num_points},\n' - repr_str += indent + f'max_voxels={self._max_voxels},\n' - repr_str += indent + f'grid_size={self._grid_size.tolist()}' - repr_str += ')' - return repr_str - - -def points_to_voxel(points, - voxel_size, - coors_range, - max_points=35, - reverse_index=True, - max_voxels=20000): - """convert kitti points(N, >=3) to voxels. - - Args: - points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and - points[:, 3:] contain other information such as reflectivity. - voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size - coors_range (list[float | tuple[float] | ndarray]): Voxel range. - format: xyzxyz, minmax - max_points (int): Indicate maximum points contained in a voxel. - reverse_index (bool): Whether return reversed coordinates. - if points has xyz format and reverse_index is True, output - coordinates will be zyx format, but points in features always - xyz format. - max_voxels (int): Maximum number of voxels this function creates. - For second, 20000 is a good choice. Points should be shuffled for - randomness before this function because max_voxels drops points. - - Returns: - tuple[np.ndarray]: - voxels: [M, max_points, ndim] float tensor. only contain points. - coordinates: [M, 3] int32 tensor. - num_points_per_voxel: [M] int32 tensor. - """ - if not isinstance(voxel_size, np.ndarray): - voxel_size = np.array(voxel_size, dtype=points.dtype) - if not isinstance(coors_range, np.ndarray): - coors_range = np.array(coors_range, dtype=points.dtype) - voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size - voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) - if reverse_index: - voxelmap_shape = voxelmap_shape[::-1] - # don't create large array in jit(nopython=True) code. 
- num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32) - coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) - voxels = np.zeros( - shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype) - coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) - if reverse_index: - voxel_num = _points_to_voxel_reverse_kernel( - points, voxel_size, coors_range, num_points_per_voxel, - coor_to_voxelidx, voxels, coors, max_points, max_voxels) - - else: - voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range, - num_points_per_voxel, - coor_to_voxelidx, voxels, coors, - max_points, max_voxels) - - coors = coors[:voxel_num] - voxels = voxels[:voxel_num] - num_points_per_voxel = num_points_per_voxel[:voxel_num] - - return voxels, coors, num_points_per_voxel - - -@numba.jit(nopython=True) -def _points_to_voxel_reverse_kernel(points, - voxel_size, - coors_range, - num_points_per_voxel, - coor_to_voxelidx, - voxels, - coors, - max_points=35, - max_voxels=20000): - """convert kitti points(N, >=3) to voxels. - - Args: - points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and - points[:, 3:] contain other information such as reflectivity. - voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size - coors_range (list[float | tuple[float] | ndarray]): Range of voxels. - format: xyzxyz, minmax - num_points_per_voxel (int): Number of points per voxel. - coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), - which has the same shape as the complete voxel map. It indicates - the index of each corresponding voxel. - voxels (np.ndarray): Created empty voxels. - coors (np.ndarray): Created coordinates of each voxel. - max_points (int): Indicate maximum points contained in a voxel. - max_voxels (int): Maximum number of voxels this function create. - for second, 20000 is a good choice. Points should be shuffled for - randomness before this function because max_voxels drops points. - - Returns: - tuple[np.ndarray]: - voxels: Shape [M, max_points, ndim], only contain points. - coordinates: Shape [M, 3]. - num_points_per_voxel: Shape [M]. - """ - # put all computations to one loop. - # we shouldn't create large array in main jit code, otherwise - # reduce performance - N = points.shape[0] - # ndim = points.shape[1] - 1 - ndim = 3 - ndim_minus_1 = ndim - 1 - grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size - # np.round(grid_size) - # grid_size = np.round(grid_size).astype(np.int64)(np.int32) - grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) - coor = np.zeros(shape=(3, ), dtype=np.int32) - voxel_num = 0 - failed = False - for i in range(N): - failed = False - for j in range(ndim): - c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) - if c < 0 or c >= grid_size[j]: - failed = True - break - coor[ndim_minus_1 - j] = c - if failed: - continue - voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] - if voxelidx == -1: - voxelidx = voxel_num - if voxel_num >= max_voxels: - continue - voxel_num += 1 - coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx - coors[voxelidx] = coor - num = num_points_per_voxel[voxelidx] - if num < max_points: - voxels[voxelidx, num] = points[i] - num_points_per_voxel[voxelidx] += 1 - return voxel_num - - -@numba.jit(nopython=True) -def _points_to_voxel_kernel(points, - voxel_size, - coors_range, - num_points_per_voxel, - coor_to_voxelidx, - voxels, - coors, - max_points=35, - max_voxels=20000): - """convert kitti points(N, >=3) to voxels. 
- - Args: - points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and - points[:, 3:] contain other information such as reflectivity. - voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size. - coors_range (list[float | tuple[float] | ndarray]): Range of voxels. - format: xyzxyz, minmax - num_points_per_voxel (int): Number of points per voxel. - coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), - which has the same shape as the complete voxel map. It indicates - the index of each corresponding voxel. - voxels (np.ndarray): Created empty voxels. - coors (np.ndarray): Created coordinates of each voxel. - max_points (int): Indicate maximum points contained in a voxel. - max_voxels (int): Maximum number of voxels this function create. - for second, 20000 is a good choice. Points should be shuffled for - randomness before this function because max_voxels drops points. - - Returns: - tuple[np.ndarray]: - voxels: Shape [M, max_points, ndim], only contain points. - coordinates: Shape [M, 3]. - num_points_per_voxel: Shape [M]. - """ - N = points.shape[0] - # ndim = points.shape[1] - 1 - ndim = 3 - grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size - # grid_size = np.round(grid_size).astype(np.int64)(np.int32) - grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) - - # lower_bound = coors_range[:3] - # upper_bound = coors_range[3:] - coor = np.zeros(shape=(3, ), dtype=np.int32) - voxel_num = 0 - failed = False - for i in range(N): - failed = False - for j in range(ndim): - c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) - if c < 0 or c >= grid_size[j]: - failed = True - break - coor[j] = c - if failed: - continue - voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] - if voxelidx == -1: - voxelidx = voxel_num - if voxel_num >= max_voxels: - continue - voxel_num += 1 - coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx - coors[voxelidx] = coor - num = num_points_per_voxel[voxelidx] - if num < max_points: - voxels[voxelidx, num] = points[i] - num_points_per_voxel[voxelidx] += 1 - return voxel_num +# Copyright (c) OpenMMLab. All rights reserved. +import numba +import numpy as np + + +class VoxelGenerator(object): + """Voxel generator in numpy implementation. + + Args: + voxel_size (list[float]): Size of a single voxel + point_cloud_range (list[float]): Range of points + max_num_points (int): Maximum number of points in a single voxel + max_voxels (int, optional): Maximum number of voxels. + Defaults to 20000. 
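To make the class above concrete, a short usage sketch; the voxel size and range mimic a common KITTI pillar setup, but the numbers and the random points are purely illustrative:

import numpy as np
from mmdet3d.core.voxel import VoxelGenerator

generator = VoxelGenerator(
    voxel_size=[0.16, 0.16, 4],
    point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1],
    max_num_points=32)
points = np.random.rand(1000, 4).astype(np.float32)   # x, y, z, reflectance
voxels, coors, num_points = generator.generate(points)
# voxels: (M, 32, 4); num_points: (M,); coors: (M, 3) stored as (z, y, x)
# because generate() calls points_to_voxel with reverse_index=True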
+ """ + + def __init__(self, + voxel_size, + point_cloud_range, + max_num_points, + max_voxels=20000): + + point_cloud_range = np.array(point_cloud_range, dtype=np.float32) + # [0, -40, -3, 70.4, 40, 1] + voxel_size = np.array(voxel_size, dtype=np.float32) + grid_size = (point_cloud_range[3:] - + point_cloud_range[:3]) / voxel_size + grid_size = np.round(grid_size).astype(np.int64) + + self._voxel_size = voxel_size + self._point_cloud_range = point_cloud_range + self._max_num_points = max_num_points + self._max_voxels = max_voxels + self._grid_size = grid_size + + def generate(self, points): + """Generate voxels given points.""" + return points_to_voxel(points, self._voxel_size, + self._point_cloud_range, self._max_num_points, + True, self._max_voxels) + + @property + def voxel_size(self): + """list[float]: Size of a single voxel.""" + return self._voxel_size + + @property + def max_num_points_per_voxel(self): + """int: Maximum number of points per voxel.""" + return self._max_num_points + + @property + def point_cloud_range(self): + """list[float]: Range of point cloud.""" + return self._point_cloud_range + + @property + def grid_size(self): + """np.ndarray: The size of grids.""" + return self._grid_size + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + indent = ' ' * (len(repr_str) + 1) + repr_str += f'(voxel_size={self._voxel_size},\n' + repr_str += indent + 'point_cloud_range=' + repr_str += f'{self._point_cloud_range.tolist()},\n' + repr_str += indent + f'max_num_points={self._max_num_points},\n' + repr_str += indent + f'max_voxels={self._max_voxels},\n' + repr_str += indent + f'grid_size={self._grid_size.tolist()}' + repr_str += ')' + return repr_str + + +def points_to_voxel(points, + voxel_size, + coors_range, + max_points=35, + reverse_index=True, + max_voxels=20000): + """convert kitti points(N, >=3) to voxels. + + Args: + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and + points[:, 3:] contain other information such as reflectivity. + voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size + coors_range (list[float | tuple[float] | ndarray]): Voxel range. + format: xyzxyz, minmax + max_points (int): Indicate maximum points contained in a voxel. + reverse_index (bool): Whether return reversed coordinates. + if points has xyz format and reverse_index is True, output + coordinates will be zyx format, but points in features always + xyz format. + max_voxels (int): Maximum number of voxels this function creates. + For second, 20000 is a good choice. Points should be shuffled for + randomness before this function because max_voxels drops points. + + Returns: + tuple[np.ndarray]: + voxels: [M, max_points, ndim] float tensor. only contain points. + coordinates: [M, 3] int32 tensor. + num_points_per_voxel: [M] int32 tensor. + """ + if not isinstance(voxel_size, np.ndarray): + voxel_size = np.array(voxel_size, dtype=points.dtype) + if not isinstance(coors_range, np.ndarray): + coors_range = np.array(coors_range, dtype=points.dtype) + voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size + voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) + if reverse_index: + voxelmap_shape = voxelmap_shape[::-1] + # don't create large array in jit(nopython=True) code. 
+ num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32) + coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) + voxels = np.zeros( + shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype) + coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) + if reverse_index: + voxel_num = _points_to_voxel_reverse_kernel( + points, voxel_size, coors_range, num_points_per_voxel, + coor_to_voxelidx, voxels, coors, max_points, max_voxels) + + else: + voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range, + num_points_per_voxel, + coor_to_voxelidx, voxels, coors, + max_points, max_voxels) + + coors = coors[:voxel_num] + voxels = voxels[:voxel_num] + num_points_per_voxel = num_points_per_voxel[:voxel_num] + + return voxels, coors, num_points_per_voxel + + +@numba.jit(nopython=True) +def _points_to_voxel_reverse_kernel(points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000): + """convert kitti points(N, >=3) to voxels. + + Args: + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and + points[:, 3:] contain other information such as reflectivity. + voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size + coors_range (list[float | tuple[float] | ndarray]): Range of voxels. + format: xyzxyz, minmax + num_points_per_voxel (int): Number of points per voxel. + coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), + which has the same shape as the complete voxel map. It indicates + the index of each corresponding voxel. + voxels (np.ndarray): Created empty voxels. + coors (np.ndarray): Created coordinates of each voxel. + max_points (int): Indicate maximum points contained in a voxel. + max_voxels (int): Maximum number of voxels this function create. + for second, 20000 is a good choice. Points should be shuffled for + randomness before this function because max_voxels drops points. + + Returns: + tuple[np.ndarray]: + voxels: Shape [M, max_points, ndim], only contain points. + coordinates: Shape [M, 3]. + num_points_per_voxel: Shape [M]. + """ + # put all computations to one loop. + # we shouldn't create large array in main jit code, otherwise + # reduce performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + ndim_minus_1 = ndim - 1 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # np.round(grid_size) + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + coor = np.zeros(shape=(3, ), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[ndim_minus_1 - j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + continue + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num + + +@numba.jit(nopython=True) +def _points_to_voxel_kernel(points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000): + """convert kitti points(N, >=3) to voxels. 
+ + Args: + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and + points[:, 3:] contain other information such as reflectivity. + voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size. + coors_range (list[float | tuple[float] | ndarray]): Range of voxels. + format: xyzxyz, minmax + num_points_per_voxel (int): Number of points per voxel. + coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), + which has the same shape as the complete voxel map. It indicates + the index of each corresponding voxel. + voxels (np.ndarray): Created empty voxels. + coors (np.ndarray): Created coordinates of each voxel. + max_points (int): Indicate maximum points contained in a voxel. + max_voxels (int): Maximum number of voxels this function create. + for second, 20000 is a good choice. Points should be shuffled for + randomness before this function because max_voxels drops points. + + Returns: + tuple[np.ndarray]: + voxels: Shape [M, max_points, ndim], only contain points. + coordinates: Shape [M, 3]. + num_points_per_voxel: Shape [M]. + """ + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + + # lower_bound = coors_range[:3] + # upper_bound = coors_range[3:] + coor = np.zeros(shape=(3, ), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + continue + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num diff --git a/mmdet3d/datasets/__init__.py b/mmdet3d/datasets/__init__.py index dab7502..4307ad6 100644 --- a/mmdet3d/datasets/__init__.py +++ b/mmdet3d/datasets/__init__.py @@ -1,47 +1,47 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
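The index computation inside the kernels above, worked through once with hand-picked numbers (the voxel size and range are the same illustrative KITTI-like values as before):

import numpy as np

voxel_size = np.array([0.16, 0.16, 4.0])
coors_range = np.array([0, -39.68, -3, 69.12, 39.68, 1], dtype=np.float32)
point = np.array([10.0, 1.0, -1.0])
c = np.floor((point - coors_range[:3]) / voxel_size).astype(np.int32)
# -> [62, 254, 0] as an (x, y, z) index; the reverse kernel writes it into
# coors as (z, y, x) = [0, 254, 62], matching the zyx output noted above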
-from mmdet.datasets.builder import build_dataloader
-from .builder import DATASETS, PIPELINES, build_dataset
-from .custom_3d import Custom3DDataset
-from .custom_3d_seg import Custom3DSegDataset
-from .kitti_dataset import KittiDataset
-from .kitti_mono_dataset import KittiMonoDataset
-from .lyft_dataset import LyftDataset
-from .nuscenes_dataset import NuScenesDataset
-from .nuscenes_mono_dataset import NuScenesMonoDataset
-# yapf: disable
-from .pipelines import (AffineResize, BackgroundPointsFilter, GlobalAlignment,
-                        GlobalRotScaleTrans, IndoorPatchPointSample,
-                        IndoorPointSample, LoadAnnotations3D,
-                        LoadPointsFromDict, LoadPointsFromFile,
-                        LoadPointsFromMultiSweeps, MultiViewWrapper,
-                        NormalizePointsColor, ObjectNameFilter, ObjectNoise,
-                        ObjectRangeFilter, ObjectSample, PointSample,
-                        PointShuffle, PointsRangeFilter, RandomDropPointsColor,
-                        RandomFlip3D, RandomJitterPoints, RandomRotate,
-                        RandomShiftScale, RangeLimitedRandomCrop,
-                        VoxelBasedPointSampler)
-# yapf: enable
-from .s3dis_dataset import S3DISDataset, S3DISSegDataset
-from .scannet_dataset import (ScanNetDataset, ScanNetInstanceSegDataset,
-                              ScanNetSegDataset)
-from .semantickitti_dataset import SemanticKITTIDataset
-from .sunrgbd_dataset import SUNRGBDDataset
-from .utils import get_loading_pipeline
-from .waymo_dataset import WaymoDataset
-
-__all__ = [
-    'KittiDataset', 'KittiMonoDataset', 'build_dataloader', 'DATASETS',
-    'build_dataset', 'NuScenesDataset', 'NuScenesMonoDataset', 'LyftDataset',
-    'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
-    'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter',
-    'LoadPointsFromFile', 'S3DISSegDataset', 'S3DISDataset',
-    'NormalizePointsColor', 'IndoorPatchPointSample', 'IndoorPointSample',
-    'PointSample', 'LoadAnnotations3D', 'GlobalAlignment', 'SUNRGBDDataset',
-    'ScanNetDataset', 'ScanNetSegDataset', 'ScanNetInstanceSegDataset',
-    'SemanticKITTIDataset', 'Custom3DDataset', 'Custom3DSegDataset',
-    'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
-    'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor',
-    'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize',
-    'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES',
-    'RangeLimitedRandomCrop', 'RandomRotate', 'MultiViewWrapper'
-]
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmdet.datasets.builder import build_dataloader
+from .builder import DATASETS, PIPELINES, build_dataset
+from .custom_3d import Custom3DDataset
+from .custom_3d_seg import Custom3DSegDataset
+from .kitti_dataset import KittiDataset
+from .kitti_mono_dataset import KittiMonoDataset
+from .lyft_dataset import LyftDataset
+from .nuscenes_dataset import NuScenesDataset
+from .nuscenes_mono_dataset import NuScenesMonoDataset
+# yapf: disable
+from .pipelines import (AffineResize, BackgroundPointsFilter, GlobalAlignment,
+                        GlobalRotScaleTrans, IndoorPatchPointSample,
+                        IndoorPointSample, LoadAnnotations3D,
+                        LoadPointsFromDict, LoadPointsFromFile,
+                        LoadPointsFromMultiSweeps, MultiViewWrapper,
+                        NormalizePointsColor, ObjectNameFilter, ObjectNoise,
+                        ObjectRangeFilter, ObjectSample, PointSample,
+                        PointShuffle, PointsRangeFilter, RandomDropPointsColor,
+                        RandomFlip3D, RandomJitterPoints, RandomRotate,
+                        RandomShiftScale, RangeLimitedRandomCrop,
+                        VoxelBasedPointSampler)
+# yapf: enable
+from .s3dis_dataset import S3DISDataset, S3DISSegDataset
+from .scannet_dataset import (ScanNetDataset, ScanNetInstanceSegDataset,
+                              ScanNetSegDataset)
+from .semantickitti_dataset import SemanticKITTIDataset
+from .sunrgbd_dataset import SUNRGBDDataset
+from .utils import get_loading_pipeline
+from .waymo_dataset import WaymoDataset
+
+__all__ = [
+    'KittiDataset', 'KittiMonoDataset', 'build_dataloader', 'DATASETS',
+    'build_dataset', 'NuScenesDataset', 'NuScenesMonoDataset', 'LyftDataset',
+    'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
+    'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter',
+    'LoadPointsFromFile', 'S3DISSegDataset', 'S3DISDataset',
+    'NormalizePointsColor', 'IndoorPatchPointSample', 'IndoorPointSample',
+    'PointSample', 'LoadAnnotations3D', 'GlobalAlignment', 'SUNRGBDDataset',
+    'ScanNetDataset', 'ScanNetSegDataset', 'ScanNetInstanceSegDataset',
+    'SemanticKITTIDataset', 'Custom3DDataset', 'Custom3DSegDataset',
+    'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter',
+    'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor',
+    'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize',
+    'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES',
+    'RangeLimitedRandomCrop', 'RandomRotate', 'MultiViewWrapper'
+]
diff --git a/mmdet3d/datasets/builder.py b/mmdet3d/datasets/builder.py
index 157f640..a63dea5 100644
--- a/mmdet3d/datasets/builder.py
+++ b/mmdet3d/datasets/builder.py
@@ -1,47 +1,47 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import platform - -from mmcv.utils import Registry, build_from_cfg - -from mmdet.datasets import DATASETS as MMDET_DATASETS -from mmdet.datasets.builder import _concat_dataset - -if platform.system() != 'Windows': - # https://github.com/pytorch/pytorch/issues/973 - import resource - rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) - base_soft_limit = rlimit[0] - hard_limit = rlimit[1] - soft_limit = min(max(4096, base_soft_limit), hard_limit) - resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) - -OBJECTSAMPLERS = Registry('Object sampler') -DATASETS = Registry('dataset') -PIPELINES = Registry('pipeline') - - -def build_dataset(cfg, default_args=None): - from mmdet3d.datasets.dataset_wrappers import CBGSDataset - from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, - ConcatDataset, RepeatDataset) - if isinstance(cfg, (list, tuple)): - dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) - elif cfg['type'] == 'ConcatDataset': - dataset = ConcatDataset( - [build_dataset(c, default_args) for c in cfg['datasets']], - cfg.get('separate_eval', True)) - elif cfg['type'] == 'RepeatDataset': - dataset = RepeatDataset( - build_dataset(cfg['dataset'], default_args), cfg['times']) - elif cfg['type'] == 'ClassBalancedDataset': - dataset = ClassBalancedDataset( - build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) - elif cfg['type'] == 'CBGSDataset': - dataset = CBGSDataset(build_dataset(cfg['dataset'], default_args)) - elif isinstance(cfg.get('ann_file'), (list, tuple)): - dataset = _concat_dataset(cfg, default_args) - elif cfg['type'] in DATASETS._module_dict.keys(): - dataset = build_from_cfg(cfg, DATASETS, default_args) - else: - dataset = build_from_cfg(cfg, MMDET_DATASETS, default_args) - return dataset +# Copyright (c) OpenMMLab. All rights reserved. 
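For orientation, the kind of nested config that ``build_dataset`` dispatches on; ``CBGSDataset`` and ``KittiDataset`` are real registry entries, but the paths are placeholders and the inner dict is schematic, so actually building it requires the prepared info files and the dataset-specific keys:

dataset_cfg = dict(
    type='CBGSDataset',                  # wrapper: class-balanced grouping and sampling
    dataset=dict(
        type='KittiDataset',             # leaf dataset resolved through the DATASETS registry
        data_root='data/kitti/',         # placeholder path
        ann_file='data/kitti/kitti_infos_train.pkl',
        # ... plus split, pipeline, classes, modality, ...
    ))
# dataset = build_dataset(dataset_cfg)   # runs only with the real data present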
+import platform + +from mmcv.utils import Registry, build_from_cfg + +from mmdet.datasets import DATASETS as MMDET_DATASETS +from mmdet.datasets.builder import _concat_dataset + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +OBJECTSAMPLERS = Registry('Object sampler') +DATASETS = Registry('dataset') +PIPELINES = Registry('pipeline') + + +def build_dataset(cfg, default_args=None): + from mmdet3d.datasets.dataset_wrappers import CBGSDataset + from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, + ConcatDataset, RepeatDataset) + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'ConcatDataset': + dataset = ConcatDataset( + [build_dataset(c, default_args) for c in cfg['datasets']], + cfg.get('separate_eval', True)) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + elif cfg['type'] == 'ClassBalancedDataset': + dataset = ClassBalancedDataset( + build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) + elif cfg['type'] == 'CBGSDataset': + dataset = CBGSDataset(build_dataset(cfg['dataset'], default_args)) + elif isinstance(cfg.get('ann_file'), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + elif cfg['type'] in DATASETS._module_dict.keys(): + dataset = build_from_cfg(cfg, DATASETS, default_args) + else: + dataset = build_from_cfg(cfg, MMDET_DATASETS, default_args) + return dataset diff --git a/mmdet3d/datasets/custom_3d.py b/mmdet3d/datasets/custom_3d.py index 9c6e351..4cad7d6 100644 --- a/mmdet3d/datasets/custom_3d.py +++ b/mmdet3d/datasets/custom_3d.py @@ -1,448 +1,448 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import tempfile -import warnings -from os import path as osp - -import mmcv -import numpy as np -from torch.utils.data import Dataset - -from ..core.bbox import get_box_type -from .builder import DATASETS -from .pipelines import Compose -from .utils import extract_result_dict, get_loading_pipeline - - -@DATASETS.register_module() -class Custom3DDataset(Dataset): - """Customized 3D dataset. - - This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI - dataset. - - .. code-block:: none - - [ - {'sample_idx': - 'lidar_points': {'lidar_path': velodyne_path, - .... - }, - 'annos': {'box_type_3d': (str) 'LiDAR/Camera/Depth' - 'gt_bboxes_3d': (n, 7) - 'gt_names': [list] - .... - } - 'calib': { .....} - 'images': { .....} - } - ] - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'LiDAR'. Available options includes - - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. 
- filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - """ - - def __init__(self, - data_root, - ann_file, - pipeline=None, - classes=None, - modality=None, - box_type_3d='LiDAR', - filter_empty_gt=True, - test_mode=False, - file_client_args=dict(backend='disk')): - super().__init__() - self.data_root = data_root - self.ann_file = ann_file - self.test_mode = test_mode - self.modality = modality - self.filter_empty_gt = filter_empty_gt - self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) - - self.CLASSES = self.get_classes(classes) - self.file_client = mmcv.FileClient(**file_client_args) - self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} - - # load annotations - if hasattr(self.file_client, 'get_local_path'): - with self.file_client.get_local_path(self.ann_file) as local_path: - self.data_infos = self.load_annotations(open(local_path, 'rb')) - else: - warnings.warn( - 'The used MMCV version does not have get_local_path. ' - f'We treat the {self.ann_file} as local paths and it ' - 'might cause errors if the path is not a local path. ' - 'Please use MMCV>= 1.3.16 if you meet errors.') - self.data_infos = self.load_annotations(self.ann_file) - - # process pipeline - if pipeline is not None: - self.pipeline = Compose(pipeline) - - # set group flag for the samplers - if not self.test_mode: - self._set_group_flag() - - def load_annotations(self, ann_file): - """Load annotations from ann_file. - - Args: - ann_file (str): Path of the annotation file. - - Returns: - list[dict]: List of annotations. - """ - # loading data from a file-like object needs file format - return mmcv.load(ann_file, file_format='pkl') - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): Sample index. - - pts_filename (str): Filename of point clouds. - - file_name (str): Filename of point clouds. - - ann_info (dict): Annotation info. - """ - info = self.data_infos[index] - sample_idx = info['sample_idx'] - pts_filename = osp.join(self.data_root, - info['lidar_points']['lidar_path']) - - input_dict = dict( - pts_filename=pts_filename, - sample_idx=sample_idx, - file_name=pts_filename) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): - return None - return input_dict - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: Annotation information consists of the following keys: - - - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): - 3D ground truth bboxes - - gt_labels_3d (np.ndarray): Labels of ground truths. - - gt_names (list[str]): Class names of ground truths. 
- """ - info = self.data_infos[index] - gt_bboxes_3d = info['annos']['gt_bboxes_3d'] - gt_names_3d = info['annos']['gt_names'] - gt_labels_3d = [] - for cat in gt_names_3d: - if cat in self.CLASSES: - gt_labels_3d.append(self.CLASSES.index(cat)) - else: - gt_labels_3d.append(-1) - gt_labels_3d = np.array(gt_labels_3d) - - # Obtain original box 3d type in info file - ori_box_type_3d = info['annos']['box_type_3d'] - ori_box_type_3d, _ = get_box_type(ori_box_type_3d) - - # turn original box type to target box type - gt_bboxes_3d = ori_box_type_3d( - gt_bboxes_3d, - box_dim=gt_bboxes_3d.shape[-1], - origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) - - anns_results = dict( - gt_bboxes_3d=gt_bboxes_3d, - gt_labels_3d=gt_labels_3d, - gt_names=gt_names_3d) - return anns_results - - def pre_pipeline(self, results): - """Initialization before data preparation. - - Args: - results (dict): Dict before data preprocessing. - - - img_fields (list): Image fields. - - bbox3d_fields (list): 3D bounding boxes fields. - - pts_mask_fields (list): Mask fields of points. - - pts_seg_fields (list): Mask fields of point segments. - - bbox_fields (list): Fields of bounding boxes. - - mask_fields (list): Fields of masks. - - seg_fields (list): Segment fields. - - box_type_3d (str): 3D box type. - - box_mode_3d (str): 3D box mode. - """ - results['img_fields'] = [] - results['bbox3d_fields'] = [] - results['pts_mask_fields'] = [] - results['pts_seg_fields'] = [] - results['bbox_fields'] = [] - results['mask_fields'] = [] - results['seg_fields'] = [] - results['box_type_3d'] = self.box_type_3d - results['box_mode_3d'] = self.box_mode_3d - - def prepare_train_data(self, index): - """Training data preparation. - - Args: - index (int): Index for accessing the target data. - - Returns: - dict: Training data dict of the corresponding index. - """ - input_dict = self.get_data_info(index) - if input_dict is None: - return None - self.pre_pipeline(input_dict) - example = self.pipeline(input_dict) - if self.filter_empty_gt and \ - (example is None or - ~(example['gt_labels_3d']._data != -1).any()): - return None - return example - - def prepare_test_data(self, index): - """Prepare data for testing. - - Args: - index (int): Index for accessing the target data. - - Returns: - dict: Testing data dict of the corresponding index. - """ - input_dict = self.get_data_info(index) - self.pre_pipeline(input_dict) - example = self.pipeline(input_dict) - return example - - @classmethod - def get_classes(cls, classes=None): - """Get class names of current dataset. - - Args: - classes (Sequence[str] | str): If classes is None, use - default CLASSES defined by builtin dataset. If classes is a - string, take it as a file name. The file contains the name of - classes where each line contains one class name. If classes is - a tuple or list, override the CLASSES defined by the dataset. - - Return: - list[str]: A list of class names. - """ - if classes is None: - return cls.CLASSES - - if isinstance(classes, str): - # take it as a file path - class_names = mmcv.list_from_file(classes) - elif isinstance(classes, (tuple, list)): - class_names = classes - else: - raise ValueError(f'Unsupported type {type(classes)} of classes.') - - return class_names - - def format_results(self, - outputs, - pklfile_prefix=None, - submission_prefix=None): - """Format the results to pkl file. - - Args: - outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str): The prefix of pkl files. 
It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - - Returns: - tuple: (outputs, tmp_dir), outputs is the detection results, - tmp_dir is the temporal directory created for saving json - files when ``jsonfile_prefix`` is not specified. - """ - if pklfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - pklfile_prefix = osp.join(tmp_dir.name, 'results') - out = f'{pklfile_prefix}.pkl' - mmcv.dump(outputs, out) - return outputs, tmp_dir - - def evaluate(self, - results, - metric=None, - iou_thr=(0.25, 0.5), - logger=None, - show=False, - out_dir=None, - pipeline=None): - """Evaluate. - - Evaluation in indoor protocol. - - Args: - results (list[dict]): List of results. - metric (str | list[str], optional): Metrics to be evaluated. - Defaults to None. - iou_thr (list[float]): AP IoU thresholds. Defaults to (0.25, 0.5). - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Defaults to None. - show (bool, optional): Whether to visualize. - Default: False. - out_dir (str, optional): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict: Evaluation results. - """ - from mmdet3d.core.evaluation import indoor_eval - assert isinstance( - results, list), f'Expect results to be list, got {type(results)}.' - assert len(results) > 0, 'Expect length of results > 0.' - assert len(results) == len(self.data_infos) - assert isinstance( - results[0], dict - ), f'Expect elements in results to be dict, got {type(results[0])}.' - gt_annos = [info['annos'] for info in self.data_infos] - label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)} - ret_dict = indoor_eval( - gt_annos, - results, - iou_thr, - label2cat, - logger=logger, - box_type_3d=self.box_type_3d, - box_mode_3d=self.box_mode_3d) - if show: - self.show(results, out_dir, pipeline=pipeline) - - return ret_dict - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - raise NotImplementedError('_build_default_pipeline is not implemented ' - f'for dataset {self.__class__.__name__}') - - def _get_pipeline(self, pipeline): - """Get data loading pipeline in self.show/evaluate function. - - Args: - pipeline (list[dict]): Input pipeline. If None is given, - get from self.pipeline. - """ - if pipeline is None: - if not hasattr(self, 'pipeline') or self.pipeline is None: - warnings.warn( - 'Use default pipeline for data loading, this may cause ' - 'errors when data is on ceph') - return self._build_default_pipeline() - loading_pipeline = get_loading_pipeline(self.pipeline.transforms) - return Compose(loading_pipeline) - return Compose(pipeline) - - def _extract_data(self, index, pipeline, key, load_annos=False): - """Load data using input pipeline and extract data according to key. - - Args: - index (int): Index for accessing the target data. - pipeline (:obj:`Compose`): Composed data loading pipeline. - key (str | list[str]): One single or a list of data key. - load_annos (bool): Whether to load data annotations. - If True, need to set self.test_mode as False before loading. - - Returns: - np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: - A single or a list of loaded data. - """ - assert pipeline is not None, 'data loading pipeline is not provided' - # when we want to load ground-truth via pipeline (e.g. 
bbox, seg mask) - # we need to set self.test_mode as False so that we have 'annos' - if load_annos: - original_test_mode = self.test_mode - self.test_mode = False - input_dict = self.get_data_info(index) - self.pre_pipeline(input_dict) - example = pipeline(input_dict) - - # extract data items according to keys - if isinstance(key, str): - data = extract_result_dict(example, key) - else: - data = [extract_result_dict(example, k) for k in key] - if load_annos: - self.test_mode = original_test_mode - - return data - - def __len__(self): - """Return the length of data infos. - - Returns: - int: Length of data infos. - """ - return len(self.data_infos) - - def _rand_another(self, idx): - """Randomly get another item with the same flag. - - Returns: - int: Another index of item with the same flag. - """ - pool = np.where(self.flag == self.flag[idx])[0] - return np.random.choice(pool) - - def __getitem__(self, idx): - """Get item from infos according to the given index. - - Returns: - dict: Data dictionary of the corresponding index. - """ - if self.test_mode: - return self.prepare_test_data(idx) - while True: - data = self.prepare_train_data(idx) - if data is None: - idx = self._rand_another(idx) - continue - return data - - def _set_group_flag(self): - """Set flag according to image aspect ratio. - - Images with aspect ratio greater than 1 will be set as group 1, - otherwise group 0. In 3D datasets, they are all the same, thus are all - zeros. - """ - self.flag = np.zeros(len(self), dtype=np.uint8) +# Copyright (c) OpenMMLab. All rights reserved. +import tempfile +import warnings +from os import path as osp + +import mmcv +import numpy as np +from torch.utils.data import Dataset + +from ..core.bbox import get_box_type +from .builder import DATASETS +from .pipelines import Compose +from .utils import extract_result_dict, get_loading_pipeline + + +@DATASETS.register_module() +class Custom3DDataset(Dataset): + """Customized 3D dataset. + + This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI + dataset. + + .. code-block:: none + + [ + {'sample_idx': + 'lidar_points': {'lidar_path': velodyne_path, + .... + }, + 'annos': {'box_type_3d': (str) 'LiDAR/Camera/Depth' + 'gt_bboxes_3d': (n, 7) + 'gt_names': [list] + .... + } + 'calib': { .....} + 'images': { .....} + } + ] + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR'. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. 
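Concrete datasets hook into this base class by registering a subclass and supplying their class names; a minimal, hypothetical sketch (real datasets usually also override ``get_data_info`` and provide their own ``evaluate``):

from mmdet3d.datasets.builder import DATASETS
from mmdet3d.datasets.custom_3d import Custom3DDataset


@DATASETS.register_module()
class ToyLidarDataset(Custom3DDataset):   # hypothetical name
    CLASSES = ('Car', 'Pedestrian')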
+ """ + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + modality=None, + box_type_3d='LiDAR', + filter_empty_gt=True, + test_mode=False, + file_client_args=dict(backend='disk')): + super().__init__() + self.data_root = data_root + self.ann_file = ann_file + self.test_mode = test_mode + self.modality = modality + self.filter_empty_gt = filter_empty_gt + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + + self.CLASSES = self.get_classes(classes) + self.file_client = mmcv.FileClient(**file_client_args) + self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} + + # load annotations + if hasattr(self.file_client, 'get_local_path'): + with self.file_client.get_local_path(self.ann_file) as local_path: + self.data_infos = self.load_annotations(open(local_path, 'rb')) + else: + warnings.warn( + 'The used MMCV version does not have get_local_path. ' + f'We treat the {self.ann_file} as local paths and it ' + 'might cause errors if the path is not a local path. ' + 'Please use MMCV>= 1.3.16 if you meet errors.') + self.data_infos = self.load_annotations(self.ann_file) + + # process pipeline + if pipeline is not None: + self.pipeline = Compose(pipeline) + + # set group flag for the samplers + if not self.test_mode: + self._set_group_flag() + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations. + """ + # loading data from a file-like object needs file format + return mmcv.load(ann_file, file_format='pkl') + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - file_name (str): Filename of point clouds. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info['sample_idx'] + pts_filename = osp.join(self.data_root, + info['lidar_points']['lidar_path']) + + input_dict = dict( + pts_filename=pts_filename, + sample_idx=sample_idx, + file_name=pts_filename) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): + return None + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. 
+ """ + info = self.data_infos[index] + gt_bboxes_3d = info['annos']['gt_bboxes_3d'] + gt_names_3d = info['annos']['gt_names'] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + # Obtain original box 3d type in info file + ori_box_type_3d = info['annos']['box_type_3d'] + ori_box_type_3d, _ = get_box_type(ori_box_type_3d) + + # turn original box type to target box type + gt_bboxes_3d = ori_box_type_3d( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d) + return anns_results + + def pre_pipeline(self, results): + """Initialization before data preparation. + + Args: + results (dict): Dict before data preprocessing. + + - img_fields (list): Image fields. + - bbox3d_fields (list): 3D bounding boxes fields. + - pts_mask_fields (list): Mask fields of points. + - pts_seg_fields (list): Mask fields of point segments. + - bbox_fields (list): Fields of bounding boxes. + - mask_fields (list): Fields of masks. + - seg_fields (list): Segment fields. + - box_type_3d (str): 3D box type. + - box_mode_3d (str): 3D box mode. + """ + results['img_fields'] = [] + results['bbox3d_fields'] = [] + results['pts_mask_fields'] = [] + results['pts_seg_fields'] = [] + results['bbox_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + results['box_type_3d'] = self.box_type_3d + results['box_mode_3d'] = self.box_mode_3d + + def prepare_train_data(self, index): + """Training data preparation. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Training data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + if input_dict is None: + return None + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or + ~(example['gt_labels_3d']._data != -1).any()): + return None + return example + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + @classmethod + def get_classes(cls, classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Return: + list[str]: A list of class names. + """ + if classes is None: + return cls.CLASSES + + if isinstance(classes, str): + # take it as a file path + class_names = mmcv.list_from_file(classes) + elif isinstance(classes, (tuple, list)): + class_names = classes + else: + raise ValueError(f'Unsupported type {type(classes)} of classes.') + + return class_names + + def format_results(self, + outputs, + pklfile_prefix=None, + submission_prefix=None): + """Format the results to pkl file. + + Args: + outputs (list[dict]): Testing results of the dataset. + pklfile_prefix (str): The prefix of pkl files. 
It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (outputs, tmp_dir), outputs is the detection results, + tmp_dir is the temporal directory created for saving json + files when ``jsonfile_prefix`` is not specified. + """ + if pklfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + pklfile_prefix = osp.join(tmp_dir.name, 'results') + out = f'{pklfile_prefix}.pkl' + mmcv.dump(outputs, out) + return outputs, tmp_dir + + def evaluate(self, + results, + metric=None, + iou_thr=(0.25, 0.5), + logger=None, + show=False, + out_dir=None, + pipeline=None): + """Evaluate. + + Evaluation in indoor protocol. + + Args: + results (list[dict]): List of results. + metric (str | list[str], optional): Metrics to be evaluated. + Defaults to None. + iou_thr (list[float]): AP IoU thresholds. Defaults to (0.25, 0.5). + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Defaults to None. + show (bool, optional): Whether to visualize. + Default: False. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict: Evaluation results. + """ + from mmdet3d.core.evaluation import indoor_eval + assert isinstance( + results, list), f'Expect results to be list, got {type(results)}.' + assert len(results) > 0, 'Expect length of results > 0.' + assert len(results) == len(self.data_infos) + assert isinstance( + results[0], dict + ), f'Expect elements in results to be dict, got {type(results[0])}.' + gt_annos = [info['annos'] for info in self.data_infos] + label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)} + ret_dict = indoor_eval( + gt_annos, + results, + iou_thr, + label2cat, + logger=logger, + box_type_3d=self.box_type_3d, + box_mode_3d=self.box_mode_3d) + if show: + self.show(results, out_dir, pipeline=pipeline) + + return ret_dict + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + raise NotImplementedError('_build_default_pipeline is not implemented ' + f'for dataset {self.__class__.__name__}') + + def _get_pipeline(self, pipeline): + """Get data loading pipeline in self.show/evaluate function. + + Args: + pipeline (list[dict]): Input pipeline. If None is given, + get from self.pipeline. + """ + if pipeline is None: + if not hasattr(self, 'pipeline') or self.pipeline is None: + warnings.warn( + 'Use default pipeline for data loading, this may cause ' + 'errors when data is on ceph') + return self._build_default_pipeline() + loading_pipeline = get_loading_pipeline(self.pipeline.transforms) + return Compose(loading_pipeline) + return Compose(pipeline) + + def _extract_data(self, index, pipeline, key, load_annos=False): + """Load data using input pipeline and extract data according to key. + + Args: + index (int): Index for accessing the target data. + pipeline (:obj:`Compose`): Composed data loading pipeline. + key (str | list[str]): One single or a list of data key. + load_annos (bool): Whether to load data annotations. + If True, need to set self.test_mode as False before loading. + + Returns: + np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: + A single or a list of loaded data. + """ + assert pipeline is not None, 'data loading pipeline is not provided' + # when we want to load ground-truth via pipeline (e.g. 
bbox, seg mask) + # we need to set self.test_mode as False so that we have 'annos' + if load_annos: + original_test_mode = self.test_mode + self.test_mode = False + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = pipeline(input_dict) + + # extract data items according to keys + if isinstance(key, str): + data = extract_result_dict(example, key) + else: + data = [extract_result_dict(example, k) for k in key] + if load_annos: + self.test_mode = original_test_mode + + return data + + def __len__(self): + """Return the length of data infos. + + Returns: + int: Length of data infos. + """ + return len(self.data_infos) + + def _rand_another(self, idx): + """Randomly get another item with the same flag. + + Returns: + int: Another index of item with the same flag. + """ + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __getitem__(self, idx): + """Get item from infos according to the given index. + + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. In 3D datasets, they are all the same, thus are all + zeros. + """ + self.flag = np.zeros(len(self), dtype=np.uint8) diff --git a/mmdet3d/datasets/custom_3d_seg.py b/mmdet3d/datasets/custom_3d_seg.py index e123611..3ff6e0a 100644 --- a/mmdet3d/datasets/custom_3d_seg.py +++ b/mmdet3d/datasets/custom_3d_seg.py @@ -1,465 +1,465 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import tempfile -import warnings -from os import path as osp - -import mmcv -import numpy as np -from torch.utils.data import Dataset - -from mmseg.datasets import DATASETS as SEG_DATASETS -from .builder import DATASETS -from .pipelines import Compose -from .utils import extract_result_dict, get_loading_pipeline - - -@DATASETS.register_module() -@SEG_DATASETS.register_module() -class Custom3DSegDataset(Dataset): - """Customized 3D dataset for semantic segmentation task. - - This is the base dataset of ScanNet and S3DIS dataset. - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - palette (list[list[int]], optional): The palette of segmentation map. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - ignore_index (int, optional): The label index to be ignored, e.g. - unannotated points. If None is given, set to len(self.CLASSES) to - be consistent with PointSegClassMapping function in pipeline. - Defaults to None. - scene_idxs (np.ndarray | str, optional): Precomputed index to load - data. For scenes with many points, we may sample it several times. - Defaults to None. 
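To illustrate the sampling logic of Custom3DDataset above: when filter_empty_gt is enabled, prepare_train_data returns None for frames whose labels are all -1, and __getitem__ keeps drawing replacement indices via _rand_another until a usable sample appears. A minimal, self-contained sketch of that retry pattern (FakeDataset and its values are invented for illustration only):

import numpy as np

class FakeDataset:
    """Toy stand-in mimicking the retry logic of Custom3DDataset."""

    def __init__(self):
        self.flag = np.zeros(10, dtype=np.uint8)  # one group, as in 3D datasets

    def prepare_train_data(self, idx):
        # pretend every even index was filtered out as an empty-GT frame
        return None if idx % 2 == 0 else {'idx': idx}

    def _rand_another(self, idx):
        pool = np.where(self.flag == self.flag[idx])[0]
        return np.random.choice(pool)

    def __getitem__(self, idx):
        while True:
            data = self.prepare_train_data(idx)
            if data is None:
                idx = self._rand_another(idx)  # resample until a valid frame
                continue
            return data

print(FakeDataset()[0])  # always ends on an odd index, e.g. {'idx': 3}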
- """ - # names of all classes data used for the task - CLASSES = None - - # class_ids used for training - VALID_CLASS_IDS = None - - # all possible class_ids in loaded segmentation mask - ALL_CLASS_IDS = None - - # official color for visualization - PALETTE = None - - def __init__(self, - data_root, - ann_file, - pipeline=None, - classes=None, - palette=None, - modality=None, - test_mode=False, - ignore_index=None, - scene_idxs=None, - file_client_args=dict(backend='disk')): - super().__init__() - self.data_root = data_root - self.ann_file = ann_file - self.test_mode = test_mode - self.modality = modality - self.file_client = mmcv.FileClient(**file_client_args) - - # load annotations - if hasattr(self.file_client, 'get_local_path'): - with self.file_client.get_local_path(self.ann_file) as local_path: - self.data_infos = self.load_annotations(open(local_path, 'rb')) - else: - warnings.warn( - 'The used MMCV version does not have get_local_path. ' - f'We treat the {self.ann_file} as local paths and it ' - 'might cause errors if the path is not a local path. ' - 'Please use MMCV>= 1.3.16 if you meet errors.') - self.data_infos = self.load_annotations(self.ann_file) - - if pipeline is not None: - self.pipeline = Compose(pipeline) - - self.ignore_index = len(self.CLASSES) if \ - ignore_index is None else ignore_index - - self.scene_idxs = self.get_scene_idxs(scene_idxs) - self.CLASSES, self.PALETTE = \ - self.get_classes_and_palette(classes, palette) - - # set group flag for the sampler - if not self.test_mode: - self._set_group_flag() - - def load_annotations(self, ann_file): - """Load annotations from ann_file. - - Args: - ann_file (str): Path of the annotation file. - - Returns: - list[dict]: List of annotations. - """ - # loading data from a file-like object needs file format - return mmcv.load(ann_file, file_format='pkl') - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): Sample index. - - pts_filename (str): Filename of point clouds. - - file_name (str): Filename of point clouds. - - ann_info (dict): Annotation info. - """ - info = self.data_infos[index] - sample_idx = info['point_cloud']['lidar_idx'] - pts_filename = osp.join(self.data_root, info['pts_path']) - - input_dict = dict( - pts_filename=pts_filename, - sample_idx=sample_idx, - file_name=pts_filename) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - return input_dict - - def pre_pipeline(self, results): - """Initialization before data preparation. - - Args: - results (dict): Dict before data preprocessing. - - - img_fields (list): Image fields. - - pts_mask_fields (list): Mask fields of points. - - pts_seg_fields (list): Mask fields of point segments. - - mask_fields (list): Fields of masks. - - seg_fields (list): Segment fields. - """ - results['img_fields'] = [] - results['pts_mask_fields'] = [] - results['pts_seg_fields'] = [] - results['mask_fields'] = [] - results['seg_fields'] = [] - results['bbox3d_fields'] = [] - - def prepare_train_data(self, index): - """Training data preparation. - - Args: - index (int): Index for accessing the target data. - - Returns: - dict: Training data dict of the corresponding index. 
- """ - input_dict = self.get_data_info(index) - if input_dict is None: - return None - self.pre_pipeline(input_dict) - example = self.pipeline(input_dict) - return example - - def prepare_test_data(self, index): - """Prepare data for testing. - - Args: - index (int): Index for accessing the target data. - - Returns: - dict: Testing data dict of the corresponding index. - """ - input_dict = self.get_data_info(index) - self.pre_pipeline(input_dict) - example = self.pipeline(input_dict) - return example - - def get_classes_and_palette(self, classes=None, palette=None): - """Get class names of current dataset. - - This function is taken from MMSegmentation. - - Args: - classes (Sequence[str] | str): If classes is None, use - default CLASSES defined by builtin dataset. If classes is a - string, take it as a file name. The file contains the name of - classes where each line contains one class name. If classes is - a tuple or list, override the CLASSES defined by the dataset. - Defaults to None. - palette (Sequence[Sequence[int]]] | np.ndarray): - The palette of segmentation map. If None is given, random - palette will be generated. Defaults to None. - """ - if classes is None: - self.custom_classes = False - # map id in the loaded mask to label used for training - self.label_map = { - cls_id: self.ignore_index - for cls_id in self.ALL_CLASS_IDS - } - self.label_map.update( - {cls_id: i - for i, cls_id in enumerate(self.VALID_CLASS_IDS)}) - # map label to category name - self.label2cat = { - i: cat_name - for i, cat_name in enumerate(self.CLASSES) - } - return self.CLASSES, self.PALETTE - - self.custom_classes = True - if isinstance(classes, str): - # take it as a file path - class_names = mmcv.list_from_file(classes) - elif isinstance(classes, (tuple, list)): - class_names = classes - else: - raise ValueError(f'Unsupported type {type(classes)} of classes.') - - if self.CLASSES: - if not set(class_names).issubset(self.CLASSES): - raise ValueError('classes is not a subset of CLASSES.') - - # update valid_class_ids - self.VALID_CLASS_IDS = [ - self.VALID_CLASS_IDS[self.CLASSES.index(cls_name)] - for cls_name in class_names - ] - - # dictionary, its keys are the old label ids and its values - # are the new label ids. - # used for changing pixel labels in load_annotations. - self.label_map = { - cls_id: self.ignore_index - for cls_id in self.ALL_CLASS_IDS - } - self.label_map.update( - {cls_id: i - for i, cls_id in enumerate(self.VALID_CLASS_IDS)}) - self.label2cat = { - i: cat_name - for i, cat_name in enumerate(class_names) - } - - # modify palette for visualization - palette = [ - self.PALETTE[self.CLASSES.index(cls_name)] - for cls_name in class_names - ] - - return class_names, palette - - def get_scene_idxs(self, scene_idxs): - """Compute scene_idxs for data sampling. - - We sample more times for scenes with more points. 
- """ - if self.test_mode: - # when testing, we load one whole scene every time - return np.arange(len(self.data_infos)).astype(np.int32) - - # we may need to re-sample different scenes according to scene_idxs - # this is necessary for indoor scene segmentation such as ScanNet - if scene_idxs is None: - scene_idxs = np.arange(len(self.data_infos)) - if isinstance(scene_idxs, str): - with self.file_client.get_local_path(scene_idxs) as local_path: - scene_idxs = np.load(local_path) - else: - scene_idxs = np.array(scene_idxs) - - return scene_idxs.astype(np.int32) - - def format_results(self, - outputs, - pklfile_prefix=None, - submission_prefix=None): - """Format the results to pkl file. - - Args: - outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str): The prefix of pkl files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - - Returns: - tuple: (outputs, tmp_dir), outputs is the detection results, - tmp_dir is the temporal directory created for saving json - files when ``jsonfile_prefix`` is not specified. - """ - if pklfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - pklfile_prefix = osp.join(tmp_dir.name, 'results') - out = f'{pklfile_prefix}.pkl' - mmcv.dump(outputs, out) - return outputs, tmp_dir - - def evaluate(self, - results, - metric=None, - logger=None, - show=False, - out_dir=None, - pipeline=None): - """Evaluate. - - Evaluation in semantic segmentation protocol. - - Args: - results (list[dict]): List of results. - metric (str | list[str]): Metrics to be evaluated. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Defaults to None. - show (bool, optional): Whether to visualize. - Defaults to False. - out_dir (str, optional): Path to save the visualization results. - Defaults to None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict: Evaluation results. - """ - from mmdet3d.core.evaluation import seg_eval - assert isinstance( - results, list), f'Expect results to be list, got {type(results)}.' - assert len(results) > 0, 'Expect length of results > 0.' - assert len(results) == len(self.data_infos) - assert isinstance( - results[0], dict - ), f'Expect elements in results to be dict, got {type(results[0])}.' - - load_pipeline = self._get_pipeline(pipeline) - pred_sem_masks = [result['semantic_mask'] for result in results] - gt_sem_masks = [ - self._extract_data( - i, load_pipeline, 'pts_semantic_mask', load_annos=True) - for i in range(len(self.data_infos)) - ] - ret_dict = seg_eval( - gt_sem_masks, - pred_sem_masks, - self.label2cat, - self.ignore_index, - logger=logger) - - if show: - self.show(pred_sem_masks, out_dir, pipeline=pipeline) - - return ret_dict - - def _rand_another(self, idx): - """Randomly get another item with the same flag. - - Returns: - int: Another index of item with the same flag. - """ - pool = np.where(self.flag == self.flag[idx])[0] - return np.random.choice(pool) - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - raise NotImplementedError('_build_default_pipeline is not implemented ' - f'for dataset {self.__class__.__name__}') - - def _get_pipeline(self, pipeline): - """Get data loading pipeline in self.show/evaluate function. - - Args: - pipeline (list[dict]): Input pipeline. If None is given, - get from self.pipeline. 
- """ - if pipeline is None: - if not hasattr(self, 'pipeline') or self.pipeline is None: - warnings.warn( - 'Use default pipeline for data loading, this may cause ' - 'errors when data is on ceph') - return self._build_default_pipeline() - loading_pipeline = get_loading_pipeline(self.pipeline.transforms) - return Compose(loading_pipeline) - return Compose(pipeline) - - def _extract_data(self, index, pipeline, key, load_annos=False): - """Load data using input pipeline and extract data according to key. - - Args: - index (int): Index for accessing the target data. - pipeline (:obj:`Compose`): Composed data loading pipeline. - key (str | list[str]): One single or a list of data key. - load_annos (bool): Whether to load data annotations. - If True, need to set self.test_mode as False before loading. - - Returns: - np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: - A single or a list of loaded data. - """ - assert pipeline is not None, 'data loading pipeline is not provided' - # when we want to load ground-truth via pipeline (e.g. bbox, seg mask) - # we need to set self.test_mode as False so that we have 'annos' - if load_annos: - original_test_mode = self.test_mode - self.test_mode = False - input_dict = self.get_data_info(index) - self.pre_pipeline(input_dict) - example = pipeline(input_dict) - - # extract data items according to keys - if isinstance(key, str): - data = extract_result_dict(example, key) - else: - data = [extract_result_dict(example, k) for k in key] - if load_annos: - self.test_mode = original_test_mode - - return data - - def __len__(self): - """Return the length of scene_idxs. - - Returns: - int: Length of data infos. - """ - return len(self.scene_idxs) - - def __getitem__(self, idx): - """Get item from infos according to the given index. - - In indoor scene segmentation task, each scene contains millions of - points. However, we only sample less than 10k points within a patch - each time. Therefore, we use `scene_idxs` to re-sample different rooms. - - Returns: - dict: Data dictionary of the corresponding index. - """ - scene_idx = self.scene_idxs[idx] # map to scene idx - if self.test_mode: - return self.prepare_test_data(scene_idx) - while True: - data = self.prepare_train_data(scene_idx) - if data is None: - idx = self._rand_another(idx) - scene_idx = self.scene_idxs[idx] # map to scene idx - continue - return data - - def _set_group_flag(self): - """Set flag according to image aspect ratio. - - Images with aspect ratio greater than 1 will be set as group 1, - otherwise group 0. In 3D datasets, they are all the same, thus are all - zeros. - """ - self.flag = np.zeros(len(self), dtype=np.uint8) +# Copyright (c) OpenMMLab. All rights reserved. +import tempfile +import warnings +from os import path as osp + +import mmcv +import numpy as np +from torch.utils.data import Dataset + +from mmseg.datasets import DATASETS as SEG_DATASETS +from .builder import DATASETS +from .pipelines import Compose +from .utils import extract_result_dict, get_loading_pipeline + + +@DATASETS.register_module() +@SEG_DATASETS.register_module() +class Custom3DSegDataset(Dataset): + """Customized 3D dataset for semantic segmentation task. + + This is the base dataset of ScanNet and S3DIS dataset. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. 
+ palette (list[list[int]], optional): The palette of segmentation map. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + ignore_index (int, optional): The label index to be ignored, e.g. + unannotated points. If None is given, set to len(self.CLASSES) to + be consistent with PointSegClassMapping function in pipeline. + Defaults to None. + scene_idxs (np.ndarray | str, optional): Precomputed index to load + data. For scenes with many points, we may sample it several times. + Defaults to None. + """ + # names of all classes data used for the task + CLASSES = None + + # class_ids used for training + VALID_CLASS_IDS = None + + # all possible class_ids in loaded segmentation mask + ALL_CLASS_IDS = None + + # official color for visualization + PALETTE = None + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + palette=None, + modality=None, + test_mode=False, + ignore_index=None, + scene_idxs=None, + file_client_args=dict(backend='disk')): + super().__init__() + self.data_root = data_root + self.ann_file = ann_file + self.test_mode = test_mode + self.modality = modality + self.file_client = mmcv.FileClient(**file_client_args) + + # load annotations + if hasattr(self.file_client, 'get_local_path'): + with self.file_client.get_local_path(self.ann_file) as local_path: + self.data_infos = self.load_annotations(open(local_path, 'rb')) + else: + warnings.warn( + 'The used MMCV version does not have get_local_path. ' + f'We treat the {self.ann_file} as local paths and it ' + 'might cause errors if the path is not a local path. ' + 'Please use MMCV>= 1.3.16 if you meet errors.') + self.data_infos = self.load_annotations(self.ann_file) + + if pipeline is not None: + self.pipeline = Compose(pipeline) + + self.ignore_index = len(self.CLASSES) if \ + ignore_index is None else ignore_index + + self.scene_idxs = self.get_scene_idxs(scene_idxs) + self.CLASSES, self.PALETTE = \ + self.get_classes_and_palette(classes, palette) + + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations. + """ + # loading data from a file-like object needs file format + return mmcv.load(ann_file, file_format='pkl') + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - file_name (str): Filename of point clouds. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info['point_cloud']['lidar_idx'] + pts_filename = osp.join(self.data_root, info['pts_path']) + + input_dict = dict( + pts_filename=pts_filename, + sample_idx=sample_idx, + file_name=pts_filename) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + return input_dict + + def pre_pipeline(self, results): + """Initialization before data preparation. + + Args: + results (dict): Dict before data preprocessing. + + - img_fields (list): Image fields. + - pts_mask_fields (list): Mask fields of points. 
+ - pts_seg_fields (list): Mask fields of point segments. + - mask_fields (list): Fields of masks. + - seg_fields (list): Segment fields. + """ + results['img_fields'] = [] + results['pts_mask_fields'] = [] + results['pts_seg_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + results['bbox3d_fields'] = [] + + def prepare_train_data(self, index): + """Training data preparation. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Training data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + if input_dict is None: + return None + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + def get_classes_and_palette(self, classes=None, palette=None): + """Get class names of current dataset. + + This function is taken from MMSegmentation. + + Args: + classes (Sequence[str] | str): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + Defaults to None. + palette (Sequence[Sequence[int]]] | np.ndarray): + The palette of segmentation map. If None is given, random + palette will be generated. Defaults to None. + """ + if classes is None: + self.custom_classes = False + # map id in the loaded mask to label used for training + self.label_map = { + cls_id: self.ignore_index + for cls_id in self.ALL_CLASS_IDS + } + self.label_map.update( + {cls_id: i + for i, cls_id in enumerate(self.VALID_CLASS_IDS)}) + # map label to category name + self.label2cat = { + i: cat_name + for i, cat_name in enumerate(self.CLASSES) + } + return self.CLASSES, self.PALETTE + + self.custom_classes = True + if isinstance(classes, str): + # take it as a file path + class_names = mmcv.list_from_file(classes) + elif isinstance(classes, (tuple, list)): + class_names = classes + else: + raise ValueError(f'Unsupported type {type(classes)} of classes.') + + if self.CLASSES: + if not set(class_names).issubset(self.CLASSES): + raise ValueError('classes is not a subset of CLASSES.') + + # update valid_class_ids + self.VALID_CLASS_IDS = [ + self.VALID_CLASS_IDS[self.CLASSES.index(cls_name)] + for cls_name in class_names + ] + + # dictionary, its keys are the old label ids and its values + # are the new label ids. + # used for changing pixel labels in load_annotations. + self.label_map = { + cls_id: self.ignore_index + for cls_id in self.ALL_CLASS_IDS + } + self.label_map.update( + {cls_id: i + for i, cls_id in enumerate(self.VALID_CLASS_IDS)}) + self.label2cat = { + i: cat_name + for i, cat_name in enumerate(class_names) + } + + # modify palette for visualization + palette = [ + self.PALETTE[self.CLASSES.index(cls_name)] + for cls_name in class_names + ] + + return class_names, palette + + def get_scene_idxs(self, scene_idxs): + """Compute scene_idxs for data sampling. + + We sample more times for scenes with more points. 
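To make that resampling concrete, here is a small standalone sketch, with invented numbers, of how a precomputed scene_idxs array stretches one epoch so that point-heavy scenes are visited more often, mirroring how __len__ and __getitem__ use it below:

import numpy as np

# Invented sizes: scene 0 has roughly 3x and scene 2 roughly 2x the points
# of scene 1, so a precomputed index repeats them proportionally.
scene_idxs = np.array([0, 0, 0, 1, 2, 2], dtype=np.int32)

# __len__ reports len(scene_idxs), not the number of scenes, so one epoch
# visits point-heavy scenes several times.
for idx in range(len(scene_idxs)):
    scene_idx = scene_idxs[idx]  # the mapping done at the top of __getitem__
    print(f'sampler idx {idx} -> scene {scene_idx}')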
+ """ + if self.test_mode: + # when testing, we load one whole scene every time + return np.arange(len(self.data_infos)).astype(np.int32) + + # we may need to re-sample different scenes according to scene_idxs + # this is necessary for indoor scene segmentation such as ScanNet + if scene_idxs is None: + scene_idxs = np.arange(len(self.data_infos)) + if isinstance(scene_idxs, str): + with self.file_client.get_local_path(scene_idxs) as local_path: + scene_idxs = np.load(local_path) + else: + scene_idxs = np.array(scene_idxs) + + return scene_idxs.astype(np.int32) + + def format_results(self, + outputs, + pklfile_prefix=None, + submission_prefix=None): + """Format the results to pkl file. + + Args: + outputs (list[dict]): Testing results of the dataset. + pklfile_prefix (str): The prefix of pkl files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (outputs, tmp_dir), outputs is the detection results, + tmp_dir is the temporal directory created for saving json + files when ``jsonfile_prefix`` is not specified. + """ + if pklfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + pklfile_prefix = osp.join(tmp_dir.name, 'results') + out = f'{pklfile_prefix}.pkl' + mmcv.dump(outputs, out) + return outputs, tmp_dir + + def evaluate(self, + results, + metric=None, + logger=None, + show=False, + out_dir=None, + pipeline=None): + """Evaluate. + + Evaluation in semantic segmentation protocol. + + Args: + results (list[dict]): List of results. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Defaults to None. + show (bool, optional): Whether to visualize. + Defaults to False. + out_dir (str, optional): Path to save the visualization results. + Defaults to None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict: Evaluation results. + """ + from mmdet3d.core.evaluation import seg_eval + assert isinstance( + results, list), f'Expect results to be list, got {type(results)}.' + assert len(results) > 0, 'Expect length of results > 0.' + assert len(results) == len(self.data_infos) + assert isinstance( + results[0], dict + ), f'Expect elements in results to be dict, got {type(results[0])}.' + + load_pipeline = self._get_pipeline(pipeline) + pred_sem_masks = [result['semantic_mask'] for result in results] + gt_sem_masks = [ + self._extract_data( + i, load_pipeline, 'pts_semantic_mask', load_annos=True) + for i in range(len(self.data_infos)) + ] + ret_dict = seg_eval( + gt_sem_masks, + pred_sem_masks, + self.label2cat, + self.ignore_index, + logger=logger) + + if show: + self.show(pred_sem_masks, out_dir, pipeline=pipeline) + + return ret_dict + + def _rand_another(self, idx): + """Randomly get another item with the same flag. + + Returns: + int: Another index of item with the same flag. + """ + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + raise NotImplementedError('_build_default_pipeline is not implemented ' + f'for dataset {self.__class__.__name__}') + + def _get_pipeline(self, pipeline): + """Get data loading pipeline in self.show/evaluate function. + + Args: + pipeline (list[dict]): Input pipeline. If None is given, + get from self.pipeline. 
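The label mapping assembled in get_classes_and_palette above can be illustrated with toy class ids (invented for this sketch, not taken from any real dataset): every id in ALL_CLASS_IDS first maps to ignore_index, and the valid ids are then overwritten with consecutive training labels.

# Toy ids, invented for this sketch.
ALL_CLASS_IDS = [1, 2, 3, 4, 5]
VALID_CLASS_IDS = [2, 4, 5]
CLASSES = ('wall', 'floor', 'chair')
ignore_index = len(CLASSES)  # 3, matching the default used above

# Every loaded id falls back to ignore_index ...
label_map = {cls_id: ignore_index for cls_id in ALL_CLASS_IDS}
# ... and the valid ids are remapped to consecutive training labels.
label_map.update({cls_id: i for i, cls_id in enumerate(VALID_CLASS_IDS)})
label2cat = {i: name for i, name in enumerate(CLASSES)}

print(label_map)  # {1: 3, 2: 0, 3: 3, 4: 1, 5: 2}
print(label2cat)  # {0: 'wall', 1: 'floor', 2: 'chair'}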
+ """ + if pipeline is None: + if not hasattr(self, 'pipeline') or self.pipeline is None: + warnings.warn( + 'Use default pipeline for data loading, this may cause ' + 'errors when data is on ceph') + return self._build_default_pipeline() + loading_pipeline = get_loading_pipeline(self.pipeline.transforms) + return Compose(loading_pipeline) + return Compose(pipeline) + + def _extract_data(self, index, pipeline, key, load_annos=False): + """Load data using input pipeline and extract data according to key. + + Args: + index (int): Index for accessing the target data. + pipeline (:obj:`Compose`): Composed data loading pipeline. + key (str | list[str]): One single or a list of data key. + load_annos (bool): Whether to load data annotations. + If True, need to set self.test_mode as False before loading. + + Returns: + np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: + A single or a list of loaded data. + """ + assert pipeline is not None, 'data loading pipeline is not provided' + # when we want to load ground-truth via pipeline (e.g. bbox, seg mask) + # we need to set self.test_mode as False so that we have 'annos' + if load_annos: + original_test_mode = self.test_mode + self.test_mode = False + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = pipeline(input_dict) + + # extract data items according to keys + if isinstance(key, str): + data = extract_result_dict(example, key) + else: + data = [extract_result_dict(example, k) for k in key] + if load_annos: + self.test_mode = original_test_mode + + return data + + def __len__(self): + """Return the length of scene_idxs. + + Returns: + int: Length of data infos. + """ + return len(self.scene_idxs) + + def __getitem__(self, idx): + """Get item from infos according to the given index. + + In indoor scene segmentation task, each scene contains millions of + points. However, we only sample less than 10k points within a patch + each time. Therefore, we use `scene_idxs` to re-sample different rooms. + + Returns: + dict: Data dictionary of the corresponding index. + """ + scene_idx = self.scene_idxs[idx] # map to scene idx + if self.test_mode: + return self.prepare_test_data(scene_idx) + while True: + data = self.prepare_train_data(scene_idx) + if data is None: + idx = self._rand_another(idx) + scene_idx = self.scene_idxs[idx] # map to scene idx + continue + return data + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. In 3D datasets, they are all the same, thus are all + zeros. + """ + self.flag = np.zeros(len(self), dtype=np.uint8) diff --git a/mmdet3d/datasets/dataset_wrappers.py b/mmdet3d/datasets/dataset_wrappers.py index 2ae3327..5d0efd1 100644 --- a/mmdet3d/datasets/dataset_wrappers.py +++ b/mmdet3d/datasets/dataset_wrappers.py @@ -1,76 +1,76 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np - -from .builder import DATASETS - - -@DATASETS.register_module() -class CBGSDataset(object): - """A wrapper of class sampled dataset with ann_file path. Implementation of - paper `Class-balanced Grouping and Sampling for Point Cloud 3D Object - Detection `_. - - Balance the number of scenes under different classes. - - Args: - dataset (:obj:`CustomDataset`): The dataset to be class sampled. 
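The class-balanced grouping described here reduces to computing, per class, the fraction of samples it contributes and then re-drawing each class's indices so every class approaches an equal share of 1 / num_classes. A toy sketch with made-up index pools (illustration only):

import numpy as np

# Made-up pools: class 0 is common, class 2 is rare.
class_sample_idxs = {0: list(range(60)), 1: list(range(30)), 2: list(range(10))}
num_classes = len(class_sample_idxs)

duplicated_samples = sum(len(v) for v in class_sample_idxs.values())
class_distribution = {k: len(v) / duplicated_samples
                      for k, v in class_sample_idxs.items()}

frac = 1.0 / num_classes
ratios = [frac / v for v in class_distribution.values()]  # rare classes get ratios > 1

sample_indices = []
for cls_inds, ratio in zip(class_sample_idxs.values(), ratios):
    sample_indices += np.random.choice(cls_inds,
                                       int(len(cls_inds) * ratio)).tolist()

# Each class now contributes roughly duplicated_samples / num_classes samples.
print(len(sample_indices))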
- """ - - def __init__(self, dataset): - self.dataset = dataset - self.CLASSES = dataset.CLASSES - self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} - self.sample_indices = self._get_sample_indices() - # self.dataset.data_infos = self.data_infos - if hasattr(self.dataset, 'flag'): - self.flag = np.array( - [self.dataset.flag[ind] for ind in self.sample_indices], - dtype=np.uint8) - - def _get_sample_indices(self): - """Load annotations from ann_file. - - Args: - ann_file (str): Path of the annotation file. - - Returns: - list[dict]: List of annotations after class sampling. - """ - class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()} - for idx in range(len(self.dataset)): - sample_cat_ids = self.dataset.get_cat_ids(idx) - for cat_id in sample_cat_ids: - class_sample_idxs[cat_id].append(idx) - duplicated_samples = sum( - [len(v) for _, v in class_sample_idxs.items()]) - class_distribution = { - k: len(v) / duplicated_samples - for k, v in class_sample_idxs.items() - } - - sample_indices = [] - - frac = 1.0 / len(self.CLASSES) - ratios = [frac / v for v in class_distribution.values()] - for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios): - sample_indices += np.random.choice(cls_inds, - int(len(cls_inds) * - ratio)).tolist() - return sample_indices - - def __getitem__(self, idx): - """Get item from infos according to the given index. - - Returns: - dict: Data dictionary of the corresponding index. - """ - ori_idx = self.sample_indices[idx] - return self.dataset[ori_idx] - - def __len__(self): - """Return the length of data infos. - - Returns: - int: Length of data infos. - """ - return len(self.sample_indices) +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +from .builder import DATASETS + + +@DATASETS.register_module() +class CBGSDataset(object): + """A wrapper of class sampled dataset with ann_file path. Implementation of + paper `Class-balanced Grouping and Sampling for Point Cloud 3D Object + Detection `_. + + Balance the number of scenes under different classes. + + Args: + dataset (:obj:`CustomDataset`): The dataset to be class sampled. + """ + + def __init__(self, dataset): + self.dataset = dataset + self.CLASSES = dataset.CLASSES + self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} + self.sample_indices = self._get_sample_indices() + # self.dataset.data_infos = self.data_infos + if hasattr(self.dataset, 'flag'): + self.flag = np.array( + [self.dataset.flag[ind] for ind in self.sample_indices], + dtype=np.uint8) + + def _get_sample_indices(self): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations after class sampling. + """ + class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()} + for idx in range(len(self.dataset)): + sample_cat_ids = self.dataset.get_cat_ids(idx) + for cat_id in sample_cat_ids: + class_sample_idxs[cat_id].append(idx) + duplicated_samples = sum( + [len(v) for _, v in class_sample_idxs.items()]) + class_distribution = { + k: len(v) / duplicated_samples + for k, v in class_sample_idxs.items() + } + + sample_indices = [] + + frac = 1.0 / len(self.CLASSES) + ratios = [frac / v for v in class_distribution.values()] + for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios): + sample_indices += np.random.choice(cls_inds, + int(len(cls_inds) * + ratio)).tolist() + return sample_indices + + def __getitem__(self, idx): + """Get item from infos according to the given index. 
+ + Returns: + dict: Data dictionary of the corresponding index. + """ + ori_idx = self.sample_indices[idx] + return self.dataset[ori_idx] + + def __len__(self): + """Return the length of data infos. + + Returns: + int: Length of data infos. + """ + return len(self.sample_indices) diff --git a/mmdet3d/datasets/kitti2d_dataset.py b/mmdet3d/datasets/kitti2d_dataset.py index a943932..cad59c4 100644 --- a/mmdet3d/datasets/kitti2d_dataset.py +++ b/mmdet3d/datasets/kitti2d_dataset.py @@ -1,241 +1,241 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np - -from mmdet.datasets import CustomDataset -from .builder import DATASETS - - -@DATASETS.register_module() -class Kitti2DDataset(CustomDataset): - r"""KITTI 2D Dataset. - - This class serves as the API for experiments on the `KITTI Dataset - `_. - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'LiDAR'. Available options includes - - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - """ - - CLASSES = ('car', 'pedestrian', 'cyclist') - """ - Annotation format: - [ - { - 'image': { - 'image_idx': 0, - 'image_path': 'training/image_2/000000.png', - 'image_shape': array([ 370, 1224], dtype=int32) - }, - 'point_cloud': { - 'num_features': 4, - 'velodyne_path': 'training/velodyne/000000.bin' - }, - 'calib': { - 'P0': (4, 4), - 'P1': (4, 4), - 'P2': (4, 4), - 'P3': (4, 4), - 'R0_rect':4x4 np.array, - 'Tr_velo_to_cam': 4x4 np.array, - 'Tr_imu_to_velo': 4x4 np.array - }, - 'annos': { - 'name': (n), - 'truncated': (n), - 'occluded': (n), - 'alpha': (n), - 'bbox': (n, 4), - 'dimensions': (n, 3), - 'location': (n, 3), - 'rotation_y': (n), - 'score': (n), - 'index': array([0], dtype=int32), - 'group_ids': array([0], dtype=int32), - 'difficulty': array([0], dtype=int32), - 'num_points_in_gt': (n), - } - } - ] - """ - - def load_annotations(self, ann_file): - """Load annotations from ann_file. - - Args: - ann_file (str): Path of the annotation file. - - Returns: - list[dict]: List of annotations. - """ - self.data_infos = mmcv.load(ann_file) - self.cat2label = { - cat_name: i - for i, cat_name in enumerate(self.CLASSES) - } - return self.data_infos - - def _filter_imgs(self, min_size=32): - """Filter images without ground truths.""" - valid_inds = [] - for i, img_info in enumerate(self.data_infos): - if len(img_info['annos']['name']) > 0: - valid_inds.append(i) - return valid_inds - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: Annotation information consists of the following keys: - - - bboxes (np.ndarray): Ground truth bboxes. - - labels (np.ndarray): Labels of ground truths. 
- """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - annos = info['annos'] - gt_names = annos['name'] - gt_bboxes = annos['bbox'] - difficulty = annos['difficulty'] - - # remove classes that is not needed - selected = self.keep_arrays_by_name(gt_names, self.CLASSES) - gt_bboxes = gt_bboxes[selected] - gt_names = gt_names[selected] - difficulty = difficulty[selected] - gt_labels = np.array([self.cat2label[n] for n in gt_names]) - - anns_results = dict( - bboxes=gt_bboxes.astype(np.float32), - labels=gt_labels, - ) - return anns_results - - def prepare_train_img(self, idx): - """Training image preparation. - - Args: - index (int): Index for accessing the target image data. - - Returns: - dict: Training image data dict after preprocessing - corresponding to the index. - """ - img_raw_info = self.data_infos[idx]['image'] - img_info = dict(filename=img_raw_info['image_path']) - ann_info = self.get_ann_info(idx) - if len(ann_info['bboxes']) == 0: - return None - results = dict(img_info=img_info, ann_info=ann_info) - if self.proposals is not None: - results['proposals'] = self.proposals[idx] - self.pre_pipeline(results) - return self.pipeline(results) - - def prepare_test_img(self, idx): - """Prepare data for testing. - - Args: - index (int): Index for accessing the target image data. - - Returns: - dict: Testing image data dict after preprocessing - corresponding to the index. - """ - img_raw_info = self.data_infos[idx]['image'] - img_info = dict(filename=img_raw_info['image_path']) - results = dict(img_info=img_info) - if self.proposals is not None: - results['proposals'] = self.proposals[idx] - self.pre_pipeline(results) - return self.pipeline(results) - - def drop_arrays_by_name(self, gt_names, used_classes): - """Drop irrelevant ground truths by name. - - Args: - gt_names (list[str]): Names of ground truths. - used_classes (list[str]): Classes of interest. - - Returns: - np.ndarray: Indices of ground truths that will be dropped. - """ - inds = [i for i, x in enumerate(gt_names) if x not in used_classes] - inds = np.array(inds, dtype=np.int64) - return inds - - def keep_arrays_by_name(self, gt_names, used_classes): - """Keep useful ground truths by name. - - Args: - gt_names (list[str]): Names of ground truths. - used_classes (list[str]): Classes of interest. - - Returns: - np.ndarray: Indices of ground truths that will be keeped. - """ - inds = [i for i, x in enumerate(gt_names) if x in used_classes] - inds = np.array(inds, dtype=np.int64) - return inds - - def reformat_bbox(self, outputs, out=None): - """Reformat bounding boxes to KITTI 2D styles. - - Args: - outputs (list[np.ndarray]): List of arrays storing the inferenced - bounding boxes and scores. - out (str, optional): The prefix of output file. - Default: None. - - Returns: - list[dict]: A list of dictionaries with the kitti 2D format. - """ - from mmdet3d.core.bbox.transforms import bbox2result_kitti2d - sample_idx = [info['image']['image_idx'] for info in self.data_infos] - result_files = bbox2result_kitti2d(outputs, self.CLASSES, sample_idx, - out) - return result_files - - def evaluate(self, result_files, eval_types=None): - """Evaluation in KITTI protocol. - - Args: - result_files (str): Path of result files. - eval_types (str, optional): Types of evaluation. Default: None. - KITTI dataset only support 'bbox' evaluation type. - - Returns: - tuple (str, dict): Average precision results in str format - and average precision results in dict format. 
- """ - from mmdet3d.core.evaluation import kitti_eval - eval_types = ['bbox'] if not eval_types else eval_types - assert eval_types in ('bbox', ['bbox' - ]), 'KITTI data set only evaluate bbox' - gt_annos = [info['annos'] for info in self.data_infos] - ap_result_str, ap_dict = kitti_eval( - gt_annos, result_files, self.CLASSES, eval_types=['bbox']) - return ap_result_str, ap_dict +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np + +from mmdet.datasets import CustomDataset +from .builder import DATASETS + + +@DATASETS.register_module() +class Kitti2DDataset(CustomDataset): + r"""KITTI 2D Dataset. + + This class serves as the API for experiments on the `KITTI Dataset + `_. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR'. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ + + CLASSES = ('car', 'pedestrian', 'cyclist') + """ + Annotation format: + [ + { + 'image': { + 'image_idx': 0, + 'image_path': 'training/image_2/000000.png', + 'image_shape': array([ 370, 1224], dtype=int32) + }, + 'point_cloud': { + 'num_features': 4, + 'velodyne_path': 'training/velodyne/000000.bin' + }, + 'calib': { + 'P0': (4, 4), + 'P1': (4, 4), + 'P2': (4, 4), + 'P3': (4, 4), + 'R0_rect':4x4 np.array, + 'Tr_velo_to_cam': 4x4 np.array, + 'Tr_imu_to_velo': 4x4 np.array + }, + 'annos': { + 'name': (n), + 'truncated': (n), + 'occluded': (n), + 'alpha': (n), + 'bbox': (n, 4), + 'dimensions': (n, 3), + 'location': (n, 3), + 'rotation_y': (n), + 'score': (n), + 'index': array([0], dtype=int32), + 'group_ids': array([0], dtype=int32), + 'difficulty': array([0], dtype=int32), + 'num_points_in_gt': (n), + } + } + ] + """ + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations. + """ + self.data_infos = mmcv.load(ann_file) + self.cat2label = { + cat_name: i + for i, cat_name in enumerate(self.CLASSES) + } + return self.data_infos + + def _filter_imgs(self, min_size=32): + """Filter images without ground truths.""" + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if len(img_info['annos']['name']) > 0: + valid_inds.append(i) + return valid_inds + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - bboxes (np.ndarray): Ground truth bboxes. + - labels (np.ndarray): Labels of ground truths. 
+ """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + annos = info['annos'] + gt_names = annos['name'] + gt_bboxes = annos['bbox'] + difficulty = annos['difficulty'] + + # remove classes that is not needed + selected = self.keep_arrays_by_name(gt_names, self.CLASSES) + gt_bboxes = gt_bboxes[selected] + gt_names = gt_names[selected] + difficulty = difficulty[selected] + gt_labels = np.array([self.cat2label[n] for n in gt_names]) + + anns_results = dict( + bboxes=gt_bboxes.astype(np.float32), + labels=gt_labels, + ) + return anns_results + + def prepare_train_img(self, idx): + """Training image preparation. + + Args: + index (int): Index for accessing the target image data. + + Returns: + dict: Training image data dict after preprocessing + corresponding to the index. + """ + img_raw_info = self.data_infos[idx]['image'] + img_info = dict(filename=img_raw_info['image_path']) + ann_info = self.get_ann_info(idx) + if len(ann_info['bboxes']) == 0: + return None + results = dict(img_info=img_info, ann_info=ann_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def prepare_test_img(self, idx): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target image data. + + Returns: + dict: Testing image data dict after preprocessing + corresponding to the index. + """ + img_raw_info = self.data_infos[idx]['image'] + img_info = dict(filename=img_raw_info['image_path']) + results = dict(img_info=img_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def drop_arrays_by_name(self, gt_names, used_classes): + """Drop irrelevant ground truths by name. + + Args: + gt_names (list[str]): Names of ground truths. + used_classes (list[str]): Classes of interest. + + Returns: + np.ndarray: Indices of ground truths that will be dropped. + """ + inds = [i for i, x in enumerate(gt_names) if x not in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def keep_arrays_by_name(self, gt_names, used_classes): + """Keep useful ground truths by name. + + Args: + gt_names (list[str]): Names of ground truths. + used_classes (list[str]): Classes of interest. + + Returns: + np.ndarray: Indices of ground truths that will be keeped. + """ + inds = [i for i, x in enumerate(gt_names) if x in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def reformat_bbox(self, outputs, out=None): + """Reformat bounding boxes to KITTI 2D styles. + + Args: + outputs (list[np.ndarray]): List of arrays storing the inferenced + bounding boxes and scores. + out (str, optional): The prefix of output file. + Default: None. + + Returns: + list[dict]: A list of dictionaries with the kitti 2D format. + """ + from mmdet3d.core.bbox.transforms import bbox2result_kitti2d + sample_idx = [info['image']['image_idx'] for info in self.data_infos] + result_files = bbox2result_kitti2d(outputs, self.CLASSES, sample_idx, + out) + return result_files + + def evaluate(self, result_files, eval_types=None): + """Evaluation in KITTI protocol. + + Args: + result_files (str): Path of result files. + eval_types (str, optional): Types of evaluation. Default: None. + KITTI dataset only support 'bbox' evaluation type. + + Returns: + tuple (str, dict): Average precision results in str format + and average precision results in dict format. 
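A short standalone sketch of the name-based filtering used by get_ann_info above (the annotation arrays are invented): keep_arrays_by_name yields integer indices of the wanted classes, every per-object array is sliced with them, and the surviving names are mapped to labels through cat2label.

import numpy as np

CLASSES = ('car', 'pedestrian', 'cyclist')
cat2label = {name: i for i, name in enumerate(CLASSES)}

# Invented per-object annotation arrays.
gt_names = np.array(['car', 'DontCare', 'cyclist'])
gt_bboxes = np.array([[0, 0, 50, 40], [10, 10, 20, 20], [5, 5, 30, 60]])

keep = np.array([i for i, name in enumerate(gt_names) if name in CLASSES],
                dtype=np.int64)

gt_names = gt_names[keep]                   # ['car', 'cyclist']
gt_bboxes = gt_bboxes[keep].astype(np.float32)
gt_labels = np.array([cat2label[n] for n in gt_names])  # [0, 2]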
+ """ + from mmdet3d.core.evaluation import kitti_eval + eval_types = ['bbox'] if not eval_types else eval_types + assert eval_types in ('bbox', ['bbox' + ]), 'KITTI data set only evaluate bbox' + gt_annos = [info['annos'] for info in self.data_infos] + ap_result_str, ap_dict = kitti_eval( + gt_annos, result_files, self.CLASSES, eval_types=['bbox']) + return ap_result_str, ap_dict diff --git a/mmdet3d/datasets/kitti_dataset.py b/mmdet3d/datasets/kitti_dataset.py index b9d0d12..dca609f 100644 --- a/mmdet3d/datasets/kitti_dataset.py +++ b/mmdet3d/datasets/kitti_dataset.py @@ -1,775 +1,775 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import os -import tempfile -from os import path as osp - -import mmcv -import numpy as np -import torch -from mmcv.utils import print_log - -from ..core import show_multi_modality_result, show_result -from ..core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, - LiDARInstance3DBoxes, points_cam2img) -from .builder import DATASETS -from .custom_3d import Custom3DDataset -from .pipelines import Compose - - -@DATASETS.register_module() -class KittiDataset(Custom3DDataset): - r"""KITTI Dataset. - - This class serves as the API for experiments on the `KITTI Dataset - `_. - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - split (str): Split of input data. - pts_prefix (str, optional): Prefix of points files. - Defaults to 'velodyne'. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'LiDAR' in this dataset. Available options includes - - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - pcd_limit_range (list, optional): The range of point cloud used to - filter invalid predicted boxes. - Default: [0, -40, -3, 70.4, 40, 0.0]. - """ - CLASSES = ('car', 'pedestrian', 'cyclist') - - def __init__(self, - data_root, - ann_file, - split, - pts_prefix='velodyne', - pipeline=None, - classes=None, - modality=None, - box_type_3d='LiDAR', - filter_empty_gt=True, - test_mode=False, - pcd_limit_range=[0, -40, -3, 70.4, 40, 0.0], - **kwargs): - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - modality=modality, - box_type_3d=box_type_3d, - filter_empty_gt=filter_empty_gt, - test_mode=test_mode, - **kwargs) - - self.split = split - self.root_split = os.path.join(self.data_root, split) - assert self.modality is not None - self.pcd_limit_range = pcd_limit_range - self.pts_prefix = pts_prefix - - def _get_pts_filename(self, idx): - """Get point cloud filename according to the given index. - - Args: - index (int): Index of the point cloud file to get. - - Returns: - str: Name of the point cloud file. 
- """ - pts_filename = osp.join(self.root_split, self.pts_prefix, - f'{idx:06d}.bin') - return pts_filename - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): Sample index. - - pts_filename (str): Filename of point clouds. - - img_prefix (str): Prefix of image files. - - img_info (dict): Image info. - - lidar2img (list[np.ndarray], optional): Transformations - from lidar to different cameras. - - ann_info (dict): Annotation info. - """ - info = self.data_infos[index] - sample_idx = info['image']['image_idx'] - img_filename = os.path.join(self.data_root, - info['image']['image_path']) - - # TODO: consider use torch.Tensor only - rect = info['calib']['R0_rect'].astype(np.float32) - Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) - P2 = info['calib']['P2'].astype(np.float32) - lidar2img = P2 @ rect @ Trv2c - - pts_filename = self._get_pts_filename(sample_idx) - input_dict = dict( - sample_idx=sample_idx, - pts_filename=pts_filename, - img_prefix=None, - img_info=dict(filename=img_filename), - lidar2img=lidar2img) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - - return input_dict - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: annotation information consists of the following keys: - - - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): - 3D ground truth bboxes. - - gt_labels_3d (np.ndarray): Labels of ground truths. - - gt_bboxes (np.ndarray): 2D ground truth bboxes. - - gt_labels (np.ndarray): Labels of ground truths. - - gt_names (list[str]): Class names of ground truths. - - difficulty (int): Difficulty defined by KITTI. - 0, 1, 2 represent xxxxx respectively. 
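(The difficulty levels 0, 1 and 2 referenced above conventionally correspond to KITTI's easy, moderate and hard settings.) For the calibration handling in get_data_info above, the lidar-to-image projection is simply the product P2 @ R0_rect @ Tr_velo_to_cam; a toy sketch with placeholder identity matrices, since the real values come from info['calib']:

import numpy as np

# Placeholder identity matrices, for illustration only.
rect = np.eye(4, dtype=np.float32)    # R0_rect
Trv2c = np.eye(4, dtype=np.float32)   # Tr_velo_to_cam
P2 = np.eye(4, dtype=np.float32)      # camera projection, stored 4x4 in the infos
lidar2img = P2 @ rect @ Trv2c

# Project a homogeneous LiDAR point into the image plane.
pt_lidar = np.array([10.0, 2.0, 1.0, 1.0], dtype=np.float32)
pt_img = lidar2img @ pt_lidar
u, v = pt_img[0] / pt_img[2], pt_img[1] / pt_img[2]
print(u, v)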
- """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - rect = info['calib']['R0_rect'].astype(np.float32) - Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) - - if 'plane' in info: - # convert ground plane to velodyne coordinates - reverse = np.linalg.inv(rect @ Trv2c) - - (plane_norm_cam, - plane_off_cam) = (info['plane'][:3], - -info['plane'][:3] * info['plane'][3]) - plane_norm_lidar = \ - (reverse[:3, :3] @ plane_norm_cam[:, None])[:, 0] - plane_off_lidar = ( - reverse[:3, :3] @ plane_off_cam[:, None][:, 0] + - reverse[:3, 3]) - plane_lidar = np.zeros_like(plane_norm_lidar, shape=(4, )) - plane_lidar[:3] = plane_norm_lidar - plane_lidar[3] = -plane_norm_lidar.T @ plane_off_lidar - else: - plane_lidar = None - - difficulty = info['annos']['difficulty'] - annos = info['annos'] - # we need other objects to avoid collision when sample - annos = self.remove_dontcare(annos) - loc = annos['location'] - dims = annos['dimensions'] - rots = annos['rotation_y'] - gt_names = annos['name'] - gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], - axis=1).astype(np.float32) - - # convert gt_bboxes_3d to velodyne coordinates - gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to( - self.box_mode_3d, np.linalg.inv(rect @ Trv2c)) - gt_bboxes = annos['bbox'] - - selected = self.drop_arrays_by_name(gt_names, ['DontCare']) - gt_bboxes = gt_bboxes[selected].astype('float32') - gt_names = gt_names[selected] - - gt_labels = [] - for cat in gt_names: - if cat in self.CLASSES: - gt_labels.append(self.CLASSES.index(cat)) - else: - gt_labels.append(-1) - gt_labels = np.array(gt_labels).astype(np.int64) - gt_labels_3d = copy.deepcopy(gt_labels) - - anns_results = dict( - gt_bboxes_3d=gt_bboxes_3d, - gt_labels_3d=gt_labels_3d, - bboxes=gt_bboxes, - labels=gt_labels, - gt_names=gt_names, - plane=plane_lidar, - difficulty=difficulty, - rect=rect, - Trv2c=Trv2c) - return anns_results - - def drop_arrays_by_name(self, gt_names, used_classes): - """Drop irrelevant ground truths by name. - - Args: - gt_names (list[str]): Names of ground truths. - used_classes (list[str]): Classes of interest. - - Returns: - np.ndarray: Indices of ground truths that will be dropped. - """ - inds = [i for i, x in enumerate(gt_names) if x not in used_classes] - inds = np.array(inds, dtype=np.int64) - return inds - - def keep_arrays_by_name(self, gt_names, used_classes): - """Keep useful ground truths by name. - - Args: - gt_names (list[str]): Names of ground truths. - used_classes (list[str]): Classes of interest. - - Returns: - np.ndarray: Indices of ground truths that will be keeped. - """ - inds = [i for i, x in enumerate(gt_names) if x in used_classes] - inds = np.array(inds, dtype=np.int64) - return inds - - def remove_dontcare(self, ann_info): - """Remove annotations that do not need to be cared. - - Args: - ann_info (dict): Dict of annotation infos. The ``'DontCare'`` - annotations will be removed according to ann_file['name']. - - Returns: - dict: Annotations after filtering. - """ - img_filtered_annotations = {} - relevant_annotation_indices = [ - i for i, x in enumerate(ann_info['name']) if x != 'DontCare' - ] - for key in ann_info.keys(): - img_filtered_annotations[key] = ( - ann_info[key][relevant_annotation_indices]) - return img_filtered_annotations - - def format_results(self, - outputs, - pklfile_prefix=None, - submission_prefix=None): - """Format the results to pkl file. 
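The annotation path in get_ann_info above builds camera-frame boxes from location, dimensions and rotation_y and moves them into the LiDAR frame through the inverse rectified extrinsics. A hedged sketch of that call pattern, assuming mmdet3d and torch are importable; the box values and identity calibration matrices are placeholders:

import numpy as np
from mmdet3d.core.bbox import Box3DMode, CameraInstance3DBoxes

# Placeholder camera-frame box: location (3), dimensions (3), rotation_y (1).
cam_box = np.array([[1.0, 1.5, 10.0, 3.9, 1.6, 1.5, 0.1]], dtype=np.float32)

rect = np.eye(4, dtype=np.float32)    # R0_rect from calib
Trv2c = np.eye(4, dtype=np.float32)   # Tr_velo_to_cam from calib

# Same call pattern as above: wrap the camera boxes, then convert them to
# the LiDAR frame with the inverse rectified extrinsics.
lidar_boxes = CameraInstance3DBoxes(cam_box).convert_to(
    Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c))
print(lidar_boxes.tensor.shape)  # torch.Size([1, 7])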
- - Args: - outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str): The prefix of pkl files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - submission_prefix (str): The prefix of submitted files. It - includes the file path and the prefix of filename, e.g., - "a/b/prefix". If not specified, a temp file will be created. - Default: None. - - Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing - the json filepaths, tmp_dir is the temporal directory created - for saving json files when jsonfile_prefix is not specified. - """ - if pklfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - pklfile_prefix = osp.join(tmp_dir.name, 'results') - else: - tmp_dir = None - - if not isinstance(outputs[0], dict): - result_files = self.bbox2result_kitti2d(outputs, self.CLASSES, - pklfile_prefix, - submission_prefix) - elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0]: - result_files = dict() - for name in outputs[0]: - results_ = [out[name] for out in outputs] - pklfile_prefix_ = pklfile_prefix + name - if submission_prefix is not None: - submission_prefix_ = submission_prefix + name - else: - submission_prefix_ = None - if 'img' in name: - result_files = self.bbox2result_kitti2d( - results_, self.CLASSES, pklfile_prefix_, - submission_prefix_) - else: - result_files_ = self.bbox2result_kitti( - results_, self.CLASSES, pklfile_prefix_, - submission_prefix_) - result_files[name] = result_files_ - else: - result_files = self.bbox2result_kitti(outputs, self.CLASSES, - pklfile_prefix, - submission_prefix) - return result_files, tmp_dir - - def evaluate(self, - results, - metric=None, - logger=None, - pklfile_prefix=None, - submission_prefix=None, - show=False, - out_dir=None, - pipeline=None): - """Evaluation in KITTI protocol. - - Args: - results (list[dict]): Testing results of the dataset. - metric (str | list[str], optional): Metrics to be evaluated. - Default: None. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - pklfile_prefix (str, optional): The prefix of pkl files, including - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - submission_prefix (str, optional): The prefix of submission data. - If not specified, the submission data will not be generated. - Default: None. - show (bool, optional): Whether to visualize. - Default: False. - out_dir (str, optional): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict[str, float]: Results of each evaluation metric. 
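One branch of format_results and evaluate above that is easy to miss: plain-array results are treated as 2D detections, dict results are split per 'pts_bbox' / 'img_bbox' branch, and any branch whose name contains 'img' is scored with 2D bbox AP only, while LiDAR branches also get BEV and 3D AP. A tiny sketch of that rule (toy helper, not part of the dataset API):

def pick_eval_types(result_name):
    # Mirrors the rule used above: image-branch results only get 2D AP,
    # LiDAR-branch results additionally get BEV and 3D AP.
    return ['bbox'] if 'img' in result_name else ['bbox', 'bev', '3d']

print(pick_eval_types('img_bbox'))  # ['bbox']
print(pick_eval_types('pts_bbox'))  # ['bbox', 'bev', '3d']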
- """ - result_files, tmp_dir = self.format_results(results, pklfile_prefix) - from mmdet3d.core.evaluation import kitti_eval - gt_annos = [info['annos'] for info in self.data_infos] - - if isinstance(result_files, dict): - ap_dict = dict() - for name, result_files_ in result_files.items(): - eval_types = ['bbox', 'bev', '3d'] - if 'img' in name: - eval_types = ['bbox'] - ap_result_str, ap_dict_ = kitti_eval( - gt_annos, - result_files_, - self.CLASSES, - eval_types=eval_types) - for ap_type, ap in ap_dict_.items(): - ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap)) - - print_log( - f'Results of {name}:\n' + ap_result_str, logger=logger) - - else: - if metric == 'img_bbox': - ap_result_str, ap_dict = kitti_eval( - gt_annos, result_files, self.CLASSES, eval_types=['bbox']) - else: - ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, - self.CLASSES) - print_log('\n' + ap_result_str, logger=logger) - - if tmp_dir is not None: - tmp_dir.cleanup() - if show or out_dir: - self.show(results, out_dir, show=show, pipeline=pipeline) - return ap_dict - - def bbox2result_kitti(self, - net_outputs, - class_names, - pklfile_prefix=None, - submission_prefix=None): - """Convert 3D detection results to kitti format for evaluation and test - submission. - - Args: - net_outputs (list[np.ndarray]): List of array storing the - inferenced bounding boxes and scores. - class_names (list[String]): A list of class names. - pklfile_prefix (str): The prefix of pkl file. - submission_prefix (str): The prefix of submission file. - - Returns: - list[dict]: A list of dictionaries with the kitti format. - """ - assert len(net_outputs) == len(self.data_infos), \ - 'invalid list length of network outputs' - if submission_prefix is not None: - mmcv.mkdir_or_exist(submission_prefix) - - det_annos = [] - print('\nConverting prediction to KITTI format') - for idx, pred_dicts in enumerate( - mmcv.track_iter_progress(net_outputs)): - annos = [] - info = self.data_infos[idx] - sample_idx = info['image']['image_idx'] - image_shape = info['image']['image_shape'][:2] - box_dict = self.convert_valid_bboxes(pred_dicts, info) - anno = { - 'name': [], - 'truncated': [], - 'occluded': [], - 'alpha': [], - 'bbox': [], - 'dimensions': [], - 'location': [], - 'rotation_y': [], - 'score': [] - } - if len(box_dict['bbox']) > 0: - box_2d_preds = box_dict['bbox'] - box_preds = box_dict['box3d_camera'] - scores = box_dict['scores'] - box_preds_lidar = box_dict['box3d_lidar'] - label_preds = box_dict['label_preds'] - - for box, box_lidar, bbox, score, label in zip( - box_preds, box_preds_lidar, box_2d_preds, scores, - label_preds): - bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) - bbox[:2] = np.maximum(bbox[:2], [0, 0]) - anno['name'].append(class_names[int(label)]) - anno['truncated'].append(0.0) - anno['occluded'].append(0) - anno['alpha'].append( - -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) - anno['bbox'].append(bbox) - anno['dimensions'].append(box[3:6]) - anno['location'].append(box[:3]) - anno['rotation_y'].append(box[6]) - anno['score'].append(score) - - anno = {k: np.stack(v) for k, v in anno.items()} - annos.append(anno) - else: - anno = { - 'name': np.array([]), - 'truncated': np.array([]), - 'occluded': np.array([]), - 'alpha': np.array([]), - 'bbox': np.zeros([0, 4]), - 'dimensions': np.zeros([0, 3]), - 'location': np.zeros([0, 3]), - 'rotation_y': np.array([]), - 'score': np.array([]), - } - annos.append(anno) - - if submission_prefix is not None: - curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' 
- with open(curr_file, 'w') as f: - bbox = anno['bbox'] - loc = anno['location'] - dims = anno['dimensions'] # lhw -> hwl - - for idx in range(len(bbox)): - print( - '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( - anno['name'][idx], anno['alpha'][idx], - bbox[idx][0], bbox[idx][1], bbox[idx][2], - bbox[idx][3], dims[idx][1], dims[idx][2], - dims[idx][0], loc[idx][0], loc[idx][1], - loc[idx][2], anno['rotation_y'][idx], - anno['score'][idx]), - file=f) - - annos[-1]['sample_idx'] = np.array( - [sample_idx] * len(annos[-1]['score']), dtype=np.int64) - - det_annos += annos - - if pklfile_prefix is not None: - if not pklfile_prefix.endswith(('.pkl', '.pickle')): - out = f'{pklfile_prefix}.pkl' - mmcv.dump(det_annos, out) - print(f'Result is saved to {out}.') - - return det_annos - - def bbox2result_kitti2d(self, - net_outputs, - class_names, - pklfile_prefix=None, - submission_prefix=None): - """Convert 2D detection results to kitti format for evaluation and test - submission. - - Args: - net_outputs (list[np.ndarray]): List of array storing the - inferenced bounding boxes and scores. - class_names (list[String]): A list of class names. - pklfile_prefix (str): The prefix of pkl file. - submission_prefix (str): The prefix of submission file. - - Returns: - list[dict]: A list of dictionaries have the kitti format - """ - assert len(net_outputs) == len(self.data_infos), \ - 'invalid list length of network outputs' - det_annos = [] - print('\nConverting prediction to KITTI format') - for i, bboxes_per_sample in enumerate( - mmcv.track_iter_progress(net_outputs)): - annos = [] - anno = dict( - name=[], - truncated=[], - occluded=[], - alpha=[], - bbox=[], - dimensions=[], - location=[], - rotation_y=[], - score=[]) - sample_idx = self.data_infos[i]['image']['image_idx'] - - num_example = 0 - for label in range(len(bboxes_per_sample)): - bbox = bboxes_per_sample[label] - for i in range(bbox.shape[0]): - anno['name'].append(class_names[int(label)]) - anno['truncated'].append(0.0) - anno['occluded'].append(0) - anno['alpha'].append(0.0) - anno['bbox'].append(bbox[i, :4]) - # set dimensions (height, width, length) to zero - anno['dimensions'].append( - np.zeros(shape=[3], dtype=np.float32)) - # set the 3D translation to (-1000, -1000, -1000) - anno['location'].append( - np.ones(shape=[3], dtype=np.float32) * (-1000.0)) - anno['rotation_y'].append(0.0) - anno['score'].append(bbox[i, 4]) - num_example += 1 - - if num_example == 0: - annos.append( - dict( - name=np.array([]), - truncated=np.array([]), - occluded=np.array([]), - alpha=np.array([]), - bbox=np.zeros([0, 4]), - dimensions=np.zeros([0, 3]), - location=np.zeros([0, 3]), - rotation_y=np.array([]), - score=np.array([]), - )) - else: - anno = {k: np.stack(v) for k, v in anno.items()} - annos.append(anno) - - annos[-1]['sample_idx'] = np.array( - [sample_idx] * num_example, dtype=np.int64) - det_annos += annos - - if pklfile_prefix is not None: - # save file in pkl format - pklfile_path = ( - pklfile_prefix[:-4] if pklfile_prefix.endswith( - ('.pkl', '.pickle')) else pklfile_prefix) - mmcv.dump(det_annos, pklfile_path) - - if submission_prefix is not None: - # save file in submission format - mmcv.mkdir_or_exist(submission_prefix) - print(f'Saving KITTI submission to {submission_prefix}') - for i, anno in enumerate(det_annos): - sample_idx = self.data_infos[i]['image']['image_idx'] - cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt' - with open(cur_det_file, 'w') as 
f: - bbox = anno['bbox'] - loc = anno['location'] - dims = anno['dimensions'][::-1] # lhw -> hwl - for idx in range(len(bbox)): - print( - '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} ' - '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format( - anno['name'][idx], - anno['alpha'][idx], - *bbox[idx], # 4 float - *dims[idx], # 3 float - *loc[idx], # 3 float - anno['rotation_y'][idx], - anno['score'][idx]), - file=f, - ) - print(f'Result is saved to {submission_prefix}') - - return det_annos - - def convert_valid_bboxes(self, box_dict, info): - """Convert the predicted boxes into valid ones. - - Args: - box_dict (dict): Box dictionaries to be converted. - - - boxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes. - - scores_3d (torch.Tensor): Scores of boxes. - - labels_3d (torch.Tensor): Class labels of boxes. - info (dict): Data info. - - Returns: - dict: Valid predicted boxes. - - - bbox (np.ndarray): 2D bounding boxes. - - box3d_camera (np.ndarray): 3D bounding boxes in - camera coordinate. - - box3d_lidar (np.ndarray): 3D bounding boxes in - LiDAR coordinate. - - scores (np.ndarray): Scores of boxes. - - label_preds (np.ndarray): Class label predictions. - - sample_idx (int): Sample index. - """ - # TODO: refactor this function - box_preds = box_dict['boxes_3d'] - scores = box_dict['scores_3d'] - labels = box_dict['labels_3d'] - sample_idx = info['image']['image_idx'] - box_preds.limit_yaw(offset=0.5, period=np.pi * 2) - - if len(box_preds) == 0: - return dict( - bbox=np.zeros([0, 4]), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0, 4]), - sample_idx=sample_idx) - - rect = info['calib']['R0_rect'].astype(np.float32) - Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) - P2 = info['calib']['P2'].astype(np.float32) - img_shape = info['image']['image_shape'] - P2 = box_preds.tensor.new_tensor(P2) - - box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c) - - box_corners = box_preds_camera.corners - box_corners_in_image = points_cam2img(box_corners, P2) - # box_corners_in_image: [N, 8, 2] - minxy = torch.min(box_corners_in_image, dim=1)[0] - maxxy = torch.max(box_corners_in_image, dim=1)[0] - box_2d_preds = torch.cat([minxy, maxxy], dim=1) - # Post-processing - # check box_preds_camera - image_shape = box_preds.tensor.new_tensor(img_shape) - valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) & - (box_2d_preds[:, 1] < image_shape[0]) & - (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0)) - # check box_preds - limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) - valid_pcd_inds = ((box_preds.center > limit_range[:3]) & - (box_preds.center < limit_range[3:])) - valid_inds = valid_cam_inds & valid_pcd_inds.all(-1) - - if valid_inds.sum() > 0: - return dict( - bbox=box_2d_preds[valid_inds, :].numpy(), - box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), - box3d_lidar=box_preds[valid_inds].tensor.numpy(), - scores=scores[valid_inds].numpy(), - label_preds=labels[valid_inds].numpy(), - sample_idx=sample_idx) - else: - return dict( - bbox=np.zeros([0, 4]), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0, 4]), - sample_idx=sample_idx) - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=4, - use_dim=4, - file_client_args=dict(backend='disk')), - dict( - type='DefaultFormatBundle3D', - 
class_names=self.CLASSES, - with_label=False), - dict(type='Collect3D', keys=['points']) - ] - if self.modality['use_camera']: - pipeline.insert(0, dict(type='LoadImageFromFile')) - return Compose(pipeline) - - def show(self, results, out_dir, show=True, pipeline=None): - """Results visualization. - - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Whether to visualize the results online. - Default: False. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - if 'pts_bbox' in result.keys(): - result = result['pts_bbox'] - data_info = self.data_infos[i] - pts_path = data_info['point_cloud']['velodyne_path'] - file_name = osp.split(pts_path)[-1].split('.')[0] - points, img_metas, img = self._extract_data( - i, pipeline, ['points', 'img_metas', 'img']) - points = points.numpy() - # for now we convert points into depth mode - points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, - Coord3DMode.DEPTH) - gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() - show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, - Box3DMode.DEPTH) - pred_bboxes = result['boxes_3d'].tensor.numpy() - show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, - Box3DMode.DEPTH) - show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, - file_name, show) - - # multi-modality visualization - if self.modality['use_camera'] and 'lidar2img' in img_metas.keys(): - img = img.numpy() - # need to transpose channel to first dim - img = img.transpose(1, 2, 0) - show_pred_bboxes = LiDARInstance3DBoxes( - pred_bboxes, origin=(0.5, 0.5, 0)) - show_gt_bboxes = LiDARInstance3DBoxes( - gt_bboxes, origin=(0.5, 0.5, 0)) - show_multi_modality_result( - img, - show_gt_bboxes, - show_pred_bboxes, - img_metas['lidar2img'], - out_dir, - file_name, - box_mode='lidar', - show=show) +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os +import tempfile +from os import path as osp + +import mmcv +import numpy as np +import torch +from mmcv.utils import print_log + +from ..core import show_multi_modality_result, show_result +from ..core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, + LiDARInstance3DBoxes, points_cam2img) +from .builder import DATASETS +from .custom_3d import Custom3DDataset +from .pipelines import Compose + + +@DATASETS.register_module() +class KittiDataset(Custom3DDataset): + r"""KITTI Dataset. + + This class serves as the API for experiments on the `KITTI Dataset + `_. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + split (str): Split of input data. + pts_prefix (str, optional): Prefix of points files. + Defaults to 'velodyne'. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR' in this dataset. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. 
+ - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + pcd_limit_range (list, optional): The range of point cloud used to + filter invalid predicted boxes. + Default: [0, -40, -3, 70.4, 40, 0.0]. + """ + CLASSES = ('car', 'pedestrian', 'cyclist') + + def __init__(self, + data_root, + ann_file, + split, + pts_prefix='velodyne', + pipeline=None, + classes=None, + modality=None, + box_type_3d='LiDAR', + filter_empty_gt=True, + test_mode=False, + pcd_limit_range=[0, -40, -3, 70.4, 40, 0.0], + **kwargs): + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + **kwargs) + + self.split = split + self.root_split = os.path.join(self.data_root, split) + assert self.modality is not None + self.pcd_limit_range = pcd_limit_range + self.pts_prefix = pts_prefix + + def _get_pts_filename(self, idx): + """Get point cloud filename according to the given index. + + Args: + index (int): Index of the point cloud file to get. + + Returns: + str: Name of the point cloud file. + """ + pts_filename = osp.join(self.root_split, self.pts_prefix, + f'{idx:06d}.bin') + return pts_filename + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - img_prefix (str): Prefix of image files. + - img_info (dict): Image info. + - lidar2img (list[np.ndarray], optional): Transformations + from lidar to different cameras. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info['image']['image_idx'] + img_filename = os.path.join(self.data_root, + info['image']['image_path']) + + # TODO: consider use torch.Tensor only + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P2 = info['calib']['P2'].astype(np.float32) + lidar2img = P2 @ rect @ Trv2c + + pts_filename = self._get_pts_filename(sample_idx) + input_dict = dict( + sample_idx=sample_idx, + pts_filename=pts_filename, + img_prefix=None, + img_info=dict(filename=img_filename), + lidar2img=lidar2img) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): + 3D ground truth bboxes. + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_bboxes (np.ndarray): 2D ground truth bboxes. + - gt_labels (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. + - difficulty (int): Difficulty defined by KITTI. + 0, 1, 2 represent xxxxx respectively. 
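+                (In the KITTI devkit convention these are usually read as
+                easy, moderate and hard.)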
+ """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + + if 'plane' in info: + # convert ground plane to velodyne coordinates + reverse = np.linalg.inv(rect @ Trv2c) + + (plane_norm_cam, + plane_off_cam) = (info['plane'][:3], + -info['plane'][:3] * info['plane'][3]) + plane_norm_lidar = \ + (reverse[:3, :3] @ plane_norm_cam[:, None])[:, 0] + plane_off_lidar = ( + reverse[:3, :3] @ plane_off_cam[:, None][:, 0] + + reverse[:3, 3]) + plane_lidar = np.zeros_like(plane_norm_lidar, shape=(4, )) + plane_lidar[:3] = plane_norm_lidar + plane_lidar[3] = -plane_norm_lidar.T @ plane_off_lidar + else: + plane_lidar = None + + difficulty = info['annos']['difficulty'] + annos = info['annos'] + # we need other objects to avoid collision when sample + annos = self.remove_dontcare(annos) + loc = annos['location'] + dims = annos['dimensions'] + rots = annos['rotation_y'] + gt_names = annos['name'] + gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1).astype(np.float32) + + # convert gt_bboxes_3d to velodyne coordinates + gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to( + self.box_mode_3d, np.linalg.inv(rect @ Trv2c)) + gt_bboxes = annos['bbox'] + + selected = self.drop_arrays_by_name(gt_names, ['DontCare']) + gt_bboxes = gt_bboxes[selected].astype('float32') + gt_names = gt_names[selected] + + gt_labels = [] + for cat in gt_names: + if cat in self.CLASSES: + gt_labels.append(self.CLASSES.index(cat)) + else: + gt_labels.append(-1) + gt_labels = np.array(gt_labels).astype(np.int64) + gt_labels_3d = copy.deepcopy(gt_labels) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + bboxes=gt_bboxes, + labels=gt_labels, + gt_names=gt_names, + plane=plane_lidar, + difficulty=difficulty, + rect=rect, + Trv2c=Trv2c) + return anns_results + + def drop_arrays_by_name(self, gt_names, used_classes): + """Drop irrelevant ground truths by name. + + Args: + gt_names (list[str]): Names of ground truths. + used_classes (list[str]): Classes of interest. + + Returns: + np.ndarray: Indices of ground truths that will be dropped. + """ + inds = [i for i, x in enumerate(gt_names) if x not in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def keep_arrays_by_name(self, gt_names, used_classes): + """Keep useful ground truths by name. + + Args: + gt_names (list[str]): Names of ground truths. + used_classes (list[str]): Classes of interest. + + Returns: + np.ndarray: Indices of ground truths that will be keeped. + """ + inds = [i for i, x in enumerate(gt_names) if x in used_classes] + inds = np.array(inds, dtype=np.int64) + return inds + + def remove_dontcare(self, ann_info): + """Remove annotations that do not need to be cared. + + Args: + ann_info (dict): Dict of annotation infos. The ``'DontCare'`` + annotations will be removed according to ann_file['name']. + + Returns: + dict: Annotations after filtering. + """ + img_filtered_annotations = {} + relevant_annotation_indices = [ + i for i, x in enumerate(ann_info['name']) if x != 'DontCare' + ] + for key in ann_info.keys(): + img_filtered_annotations[key] = ( + ann_info[key][relevant_annotation_indices]) + return img_filtered_annotations + + def format_results(self, + outputs, + pklfile_prefix=None, + submission_prefix=None): + """Format the results to pkl file. 
+ + Args: + outputs (list[dict]): Testing results of the dataset. + pklfile_prefix (str): The prefix of pkl files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + submission_prefix (str): The prefix of submitted files. It + includes the file path and the prefix of filename, e.g., + "a/b/prefix". If not specified, a temp file will be created. + Default: None. + + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created + for saving json files when jsonfile_prefix is not specified. + """ + if pklfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + pklfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + if not isinstance(outputs[0], dict): + result_files = self.bbox2result_kitti2d(outputs, self.CLASSES, + pklfile_prefix, + submission_prefix) + elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0]: + result_files = dict() + for name in outputs[0]: + results_ = [out[name] for out in outputs] + pklfile_prefix_ = pklfile_prefix + name + if submission_prefix is not None: + submission_prefix_ = submission_prefix + name + else: + submission_prefix_ = None + if 'img' in name: + result_files = self.bbox2result_kitti2d( + results_, self.CLASSES, pklfile_prefix_, + submission_prefix_) + else: + result_files_ = self.bbox2result_kitti( + results_, self.CLASSES, pklfile_prefix_, + submission_prefix_) + result_files[name] = result_files_ + else: + result_files = self.bbox2result_kitti(outputs, self.CLASSES, + pklfile_prefix, + submission_prefix) + return result_files, tmp_dir + + def evaluate(self, + results, + metric=None, + logger=None, + pklfile_prefix=None, + submission_prefix=None, + show=False, + out_dir=None, + pipeline=None): + """Evaluation in KITTI protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str], optional): Metrics to be evaluated. + Default: None. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + pklfile_prefix (str, optional): The prefix of pkl files, including + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + submission_prefix (str, optional): The prefix of submission data. + If not specified, the submission data will not be generated. + Default: None. + show (bool, optional): Whether to visualize. + Default: False. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. 
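+
+        Example:
+            A minimal sketch only; the paths, the modality dict and the
+            ``results`` list are illustrative placeholders::
+
+                >>> dataset = KittiDataset(
+                ...     data_root='data/kitti/',
+                ...     ann_file='data/kitti/kitti_infos_val.pkl',
+                ...     split='training',
+                ...     modality=dict(use_lidar=True, use_camera=False))
+                >>> # ``results`` holds one dict per sample with 'boxes_3d',
+                >>> # 'scores_3d' and 'labels_3d' produced by a detector
+                >>> ap_dict = dataset.evaluate(results)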
+ """ + result_files, tmp_dir = self.format_results(results, pklfile_prefix) + from mmdet3d.core.evaluation import kitti_eval + gt_annos = [info['annos'] for info in self.data_infos] + + if isinstance(result_files, dict): + ap_dict = dict() + for name, result_files_ in result_files.items(): + eval_types = ['bbox', 'bev', '3d'] + if 'img' in name: + eval_types = ['bbox'] + ap_result_str, ap_dict_ = kitti_eval( + gt_annos, + result_files_, + self.CLASSES, + eval_types=eval_types) + for ap_type, ap in ap_dict_.items(): + ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap)) + + print_log( + f'Results of {name}:\n' + ap_result_str, logger=logger) + + else: + if metric == 'img_bbox': + ap_result_str, ap_dict = kitti_eval( + gt_annos, result_files, self.CLASSES, eval_types=['bbox']) + else: + ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, + self.CLASSES) + print_log('\n' + ap_result_str, logger=logger) + + if tmp_dir is not None: + tmp_dir.cleanup() + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) + return ap_dict + + def bbox2result_kitti(self, + net_outputs, + class_names, + pklfile_prefix=None, + submission_prefix=None): + """Convert 3D detection results to kitti format for evaluation and test + submission. + + Args: + net_outputs (list[np.ndarray]): List of array storing the + inferenced bounding boxes and scores. + class_names (list[String]): A list of class names. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. + + Returns: + list[dict]: A list of dictionaries with the kitti format. + """ + assert len(net_outputs) == len(self.data_infos), \ + 'invalid list length of network outputs' + if submission_prefix is not None: + mmcv.mkdir_or_exist(submission_prefix) + + det_annos = [] + print('\nConverting prediction to KITTI format') + for idx, pred_dicts in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + info = self.data_infos[idx] + sample_idx = info['image']['image_idx'] + image_shape = info['image']['image_shape'][:2] + box_dict = self.convert_valid_bboxes(pred_dicts, info) + anno = { + 'name': [], + 'truncated': [], + 'occluded': [], + 'alpha': [], + 'bbox': [], + 'dimensions': [], + 'location': [], + 'rotation_y': [], + 'score': [] + } + if len(box_dict['bbox']) > 0: + box_2d_preds = box_dict['bbox'] + box_preds = box_dict['box3d_camera'] + scores = box_dict['scores'] + box_preds_lidar = box_dict['box3d_lidar'] + label_preds = box_dict['label_preds'] + + for box, box_lidar, bbox, score, label in zip( + box_preds, box_preds_lidar, box_2d_preds, scores, + label_preds): + bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) + bbox[:2] = np.maximum(bbox[:2], [0, 0]) + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append( + -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) + anno['bbox'].append(bbox) + anno['dimensions'].append(box[3:6]) + anno['location'].append(box[:3]) + anno['rotation_y'].append(box[6]) + anno['score'].append(score) + + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + else: + anno = { + 'name': np.array([]), + 'truncated': np.array([]), + 'occluded': np.array([]), + 'alpha': np.array([]), + 'bbox': np.zeros([0, 4]), + 'dimensions': np.zeros([0, 3]), + 'location': np.zeros([0, 3]), + 'rotation_y': np.array([]), + 'score': np.array([]), + } + annos.append(anno) + + if submission_prefix is not None: + curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' 
+ with open(curr_file, 'w') as f: + bbox = anno['bbox'] + loc = anno['location'] + dims = anno['dimensions'] # lhw -> hwl + + for idx in range(len(bbox)): + print( + '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( + anno['name'][idx], anno['alpha'][idx], + bbox[idx][0], bbox[idx][1], bbox[idx][2], + bbox[idx][3], dims[idx][1], dims[idx][2], + dims[idx][0], loc[idx][0], loc[idx][1], + loc[idx][2], anno['rotation_y'][idx], + anno['score'][idx]), + file=f) + + annos[-1]['sample_idx'] = np.array( + [sample_idx] * len(annos[-1]['score']), dtype=np.int64) + + det_annos += annos + + if pklfile_prefix is not None: + if not pklfile_prefix.endswith(('.pkl', '.pickle')): + out = f'{pklfile_prefix}.pkl' + mmcv.dump(det_annos, out) + print(f'Result is saved to {out}.') + + return det_annos + + def bbox2result_kitti2d(self, + net_outputs, + class_names, + pklfile_prefix=None, + submission_prefix=None): + """Convert 2D detection results to kitti format for evaluation and test + submission. + + Args: + net_outputs (list[np.ndarray]): List of array storing the + inferenced bounding boxes and scores. + class_names (list[String]): A list of class names. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. + + Returns: + list[dict]: A list of dictionaries have the kitti format + """ + assert len(net_outputs) == len(self.data_infos), \ + 'invalid list length of network outputs' + det_annos = [] + print('\nConverting prediction to KITTI format') + for i, bboxes_per_sample in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + anno = dict( + name=[], + truncated=[], + occluded=[], + alpha=[], + bbox=[], + dimensions=[], + location=[], + rotation_y=[], + score=[]) + sample_idx = self.data_infos[i]['image']['image_idx'] + + num_example = 0 + for label in range(len(bboxes_per_sample)): + bbox = bboxes_per_sample[label] + for i in range(bbox.shape[0]): + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append(0.0) + anno['bbox'].append(bbox[i, :4]) + # set dimensions (height, width, length) to zero + anno['dimensions'].append( + np.zeros(shape=[3], dtype=np.float32)) + # set the 3D translation to (-1000, -1000, -1000) + anno['location'].append( + np.ones(shape=[3], dtype=np.float32) * (-1000.0)) + anno['rotation_y'].append(0.0) + anno['score'].append(bbox[i, 4]) + num_example += 1 + + if num_example == 0: + annos.append( + dict( + name=np.array([]), + truncated=np.array([]), + occluded=np.array([]), + alpha=np.array([]), + bbox=np.zeros([0, 4]), + dimensions=np.zeros([0, 3]), + location=np.zeros([0, 3]), + rotation_y=np.array([]), + score=np.array([]), + )) + else: + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + + annos[-1]['sample_idx'] = np.array( + [sample_idx] * num_example, dtype=np.int64) + det_annos += annos + + if pklfile_prefix is not None: + # save file in pkl format + pklfile_path = ( + pklfile_prefix[:-4] if pklfile_prefix.endswith( + ('.pkl', '.pickle')) else pklfile_prefix) + mmcv.dump(det_annos, pklfile_path) + + if submission_prefix is not None: + # save file in submission format + mmcv.mkdir_or_exist(submission_prefix) + print(f'Saving KITTI submission to {submission_prefix}') + for i, anno in enumerate(det_annos): + sample_idx = self.data_infos[i]['image']['image_idx'] + cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt' + with open(cur_det_file, 'w') as 
f: + bbox = anno['bbox'] + loc = anno['location'] + dims = anno['dimensions'][::-1] # lhw -> hwl + for idx in range(len(bbox)): + print( + '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} ' + '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format( + anno['name'][idx], + anno['alpha'][idx], + *bbox[idx], # 4 float + *dims[idx], # 3 float + *loc[idx], # 3 float + anno['rotation_y'][idx], + anno['score'][idx]), + file=f, + ) + print(f'Result is saved to {submission_prefix}') + + return det_annos + + def convert_valid_bboxes(self, box_dict, info): + """Convert the predicted boxes into valid ones. + + Args: + box_dict (dict): Box dictionaries to be converted. + + - boxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes. + - scores_3d (torch.Tensor): Scores of boxes. + - labels_3d (torch.Tensor): Class labels of boxes. + info (dict): Data info. + + Returns: + dict: Valid predicted boxes. + + - bbox (np.ndarray): 2D bounding boxes. + - box3d_camera (np.ndarray): 3D bounding boxes in + camera coordinate. + - box3d_lidar (np.ndarray): 3D bounding boxes in + LiDAR coordinate. + - scores (np.ndarray): Scores of boxes. + - label_preds (np.ndarray): Class label predictions. + - sample_idx (int): Sample index. + """ + # TODO: refactor this function + box_preds = box_dict['boxes_3d'] + scores = box_dict['scores_3d'] + labels = box_dict['labels_3d'] + sample_idx = info['image']['image_idx'] + box_preds.limit_yaw(offset=0.5, period=np.pi * 2) + + if len(box_preds) == 0: + return dict( + bbox=np.zeros([0, 4]), + box3d_camera=np.zeros([0, 7]), + box3d_lidar=np.zeros([0, 7]), + scores=np.zeros([0]), + label_preds=np.zeros([0, 4]), + sample_idx=sample_idx) + + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P2 = info['calib']['P2'].astype(np.float32) + img_shape = info['image']['image_shape'] + P2 = box_preds.tensor.new_tensor(P2) + + box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c) + + box_corners = box_preds_camera.corners + box_corners_in_image = points_cam2img(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + box_2d_preds = torch.cat([minxy, maxxy], dim=1) + # Post-processing + # check box_preds_camera + image_shape = box_preds.tensor.new_tensor(img_shape) + valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) & + (box_2d_preds[:, 1] < image_shape[0]) & + (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0)) + # check box_preds + limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) + valid_pcd_inds = ((box_preds.center > limit_range[:3]) & + (box_preds.center < limit_range[3:])) + valid_inds = valid_cam_inds & valid_pcd_inds.all(-1) + + if valid_inds.sum() > 0: + return dict( + bbox=box_2d_preds[valid_inds, :].numpy(), + box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), + box3d_lidar=box_preds[valid_inds].tensor.numpy(), + scores=scores[valid_inds].numpy(), + label_preds=labels[valid_inds].numpy(), + sample_idx=sample_idx) + else: + return dict( + bbox=np.zeros([0, 4]), + box3d_camera=np.zeros([0, 7]), + box3d_lidar=np.zeros([0, 7]), + scores=np.zeros([0]), + label_preds=np.zeros([0, 4]), + sample_idx=sample_idx) + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=dict(backend='disk')), + dict( + type='DefaultFormatBundle3D', + 
class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['points']) + ] + if self.modality['use_camera']: + pipeline.insert(0, dict(type='LoadImageFromFile')) + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Whether to visualize the results online. + Default: False. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + if 'pts_bbox' in result.keys(): + result = result['pts_bbox'] + data_info = self.data_infos[i] + pts_path = data_info['point_cloud']['velodyne_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points, img_metas, img = self._extract_data( + i, pipeline, ['points', 'img_metas', 'img']) + points = points.numpy() + # for now we convert points into depth mode + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() + show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + pred_bboxes = result['boxes_3d'].tensor.numpy() + show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, + file_name, show) + + # multi-modality visualization + if self.modality['use_camera'] and 'lidar2img' in img_metas.keys(): + img = img.numpy() + # need to transpose channel to first dim + img = img.transpose(1, 2, 0) + show_pred_bboxes = LiDARInstance3DBoxes( + pred_bboxes, origin=(0.5, 0.5, 0)) + show_gt_bboxes = LiDARInstance3DBoxes( + gt_bboxes, origin=(0.5, 0.5, 0)) + show_multi_modality_result( + img, + show_gt_bboxes, + show_pred_bboxes, + img_metas['lidar2img'], + out_dir, + file_name, + box_mode='lidar', + show=show) diff --git a/mmdet3d/datasets/kitti_mono_dataset.py b/mmdet3d/datasets/kitti_mono_dataset.py index c669b0a..3aed662 100644 --- a/mmdet3d/datasets/kitti_mono_dataset.py +++ b/mmdet3d/datasets/kitti_mono_dataset.py @@ -1,569 +1,569 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import tempfile -from os import path as osp - -import mmcv -import numpy as np -import torch -from mmcv.utils import print_log - -from ..core.bbox import Box3DMode, CameraInstance3DBoxes, points_cam2img -from .builder import DATASETS -from .nuscenes_mono_dataset import NuScenesMonoDataset - - -@DATASETS.register_module() -class KittiMonoDataset(NuScenesMonoDataset): - """Monocular 3D detection on KITTI Dataset. - - Args: - data_root (str): Path of dataset root. - info_file (str): Path of info file. - load_interval (int, optional): Interval of loading the dataset. It is - used to uniformly sample the dataset. Defaults to 1. - with_velocity (bool, optional): Whether include velocity prediction - into the experiments. Defaults to False. - eval_version (str, optional): Configuration version of evaluation. - Defaults to None. - version (str, optional): Dataset version. Defaults to None. - kwargs (dict): Other arguments are the same of NuScenesMonoDataset. 
- """ - - CLASSES = ('Pedestrian', 'Cyclist', 'Car') - - def __init__(self, - data_root, - info_file, - ann_file, - pipeline, - load_interval=1, - with_velocity=False, - eval_version=None, - version=None, - **kwargs): - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - load_interval=load_interval, - with_velocity=with_velocity, - eval_version=eval_version, - version=version, - **kwargs) - self.anno_infos = mmcv.load(info_file) - self.bbox_code_size = 7 - - def _parse_ann_info(self, img_info, ann_info): - """Parse bbox and mask annotation. - - Args: - ann_info (list[dict]): Annotation info of an image. - with_mask (bool): Whether to parse mask annotations. - - Returns: - dict: A dict containing the following keys: bboxes, bboxes_ignore, - labels, masks, seg_map. "masks" are raw annotations and not - decoded into binary masks. - """ - gt_bboxes = [] - gt_labels = [] - gt_bboxes_ignore = [] - gt_masks_ann = [] - gt_bboxes_cam3d = [] - centers2d = [] - depths = [] - for i, ann in enumerate(ann_info): - if ann.get('ignore', False): - continue - x1, y1, w, h = ann['bbox'] - inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) - inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) - if inter_w * inter_h == 0: - continue - if ann['area'] <= 0 or w < 1 or h < 1: - continue - if ann['category_id'] not in self.cat_ids: - continue - bbox = [x1, y1, x1 + w, y1 + h] - if ann.get('iscrowd', False): - gt_bboxes_ignore.append(bbox) - else: - gt_bboxes.append(bbox) - gt_labels.append(self.cat2label[ann['category_id']]) - gt_masks_ann.append(ann.get('segmentation', None)) - # 3D annotations in camera coordinates - bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(-1, ) - gt_bboxes_cam3d.append(bbox_cam3d) - # 2.5D annotations in camera coordinates - center2d = ann['center2d'][:2] - depth = ann['center2d'][2] - centers2d.append(center2d) - depths.append(depth) - - if gt_bboxes: - gt_bboxes = np.array(gt_bboxes, dtype=np.float32) - gt_labels = np.array(gt_labels, dtype=np.int64) - else: - gt_bboxes = np.zeros((0, 4), dtype=np.float32) - gt_labels = np.array([], dtype=np.int64) - - if gt_bboxes_cam3d: - gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) - centers2d = np.array(centers2d, dtype=np.float32) - depths = np.array(depths, dtype=np.float32) - else: - gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), - dtype=np.float32) - centers2d = np.zeros((0, 2), dtype=np.float32) - depths = np.zeros((0), dtype=np.float32) - - gt_bboxes_cam3d = CameraInstance3DBoxes( - gt_bboxes_cam3d, - box_dim=gt_bboxes_cam3d.shape[-1], - origin=(0.5, 0.5, 0.5)) - gt_labels_3d = copy.deepcopy(gt_labels) - - if gt_bboxes_ignore: - gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) - else: - gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) - - seg_map = img_info['filename'].replace('jpg', 'png') - - ann = dict( - bboxes=gt_bboxes, - labels=gt_labels, - gt_bboxes_3d=gt_bboxes_cam3d, - gt_labels_3d=gt_labels_3d, - centers2d=centers2d, - depths=depths, - bboxes_ignore=gt_bboxes_ignore, - masks=gt_masks_ann, - seg_map=seg_map) - - return ann - - def format_results(self, - outputs, - pklfile_prefix=None, - submission_prefix=None): - """Format the results to pkl file. - - Args: - outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str): The prefix of pkl files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. 
- submission_prefix (str): The prefix of submitted files. It - includes the file path and the prefix of filename, e.g., - "a/b/prefix". If not specified, a temp file will be created. - Default: None. - - Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing - the json filepaths, tmp_dir is the temporal directory created - for saving json files when jsonfile_prefix is not specified. - """ - if pklfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - pklfile_prefix = osp.join(tmp_dir.name, 'results') - else: - tmp_dir = None - - if not isinstance(outputs[0], dict): - result_files = self.bbox2result_kitti2d(outputs, self.CLASSES, - pklfile_prefix, - submission_prefix) - elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0] or \ - 'img_bbox2d' in outputs[0]: - result_files = dict() - for name in outputs[0]: - results_ = [out[name] for out in outputs] - pklfile_prefix_ = pklfile_prefix + name - if submission_prefix is not None: - submission_prefix_ = submission_prefix + name - else: - submission_prefix_ = None - if '2d' in name: - result_files_ = self.bbox2result_kitti2d( - results_, self.CLASSES, pklfile_prefix_, - submission_prefix_) - else: - result_files_ = self.bbox2result_kitti( - results_, self.CLASSES, pklfile_prefix_, - submission_prefix_) - result_files[name] = result_files_ - else: - result_files = self.bbox2result_kitti(outputs, self.CLASSES, - pklfile_prefix, - submission_prefix) - return result_files, tmp_dir - - def evaluate(self, - results, - metric=None, - logger=None, - pklfile_prefix=None, - submission_prefix=None, - show=False, - out_dir=None, - pipeline=None): - """Evaluation in KITTI protocol. - - Args: - results (list[dict]): Testing results of the dataset. - metric (str | list[str], optional): Metrics to be evaluated. - Defaults to None. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - pklfile_prefix (str, optional): The prefix of pkl files, including - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - submission_prefix (str, optional): The prefix of submission data. - If not specified, the submission data will not be generated. - show (bool, optional): Whether to visualize. - Default: False. - out_dir (str, optional): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict[str, float]: Results of each evaluation metric. 
- """ - result_files, tmp_dir = self.format_results(results, pklfile_prefix) - from mmdet3d.core.evaluation import kitti_eval - gt_annos = [info['annos'] for info in self.anno_infos] - - if isinstance(result_files, dict): - ap_dict = dict() - for name, result_files_ in result_files.items(): - eval_types = ['bbox', 'bev', '3d'] - if '2d' in name: - eval_types = ['bbox'] - ap_result_str, ap_dict_ = kitti_eval( - gt_annos, - result_files_, - self.CLASSES, - eval_types=eval_types) - for ap_type, ap in ap_dict_.items(): - ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap)) - - print_log( - f'Results of {name}:\n' + ap_result_str, logger=logger) - - else: - if metric == 'img_bbox2d': - ap_result_str, ap_dict = kitti_eval( - gt_annos, result_files, self.CLASSES, eval_types=['bbox']) - else: - ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, - self.CLASSES) - print_log('\n' + ap_result_str, logger=logger) - - if tmp_dir is not None: - tmp_dir.cleanup() - if show or out_dir: - self.show(results, out_dir, show=show, pipeline=pipeline) - return ap_dict - - def bbox2result_kitti(self, - net_outputs, - class_names, - pklfile_prefix=None, - submission_prefix=None): - """Convert 3D detection results to kitti format for evaluation and test - submission. - - Args: - net_outputs (list[np.ndarray]): List of array storing the - inferenced bounding boxes and scores. - class_names (list[String]): A list of class names. - pklfile_prefix (str): The prefix of pkl file. - submission_prefix (str): The prefix of submission file. - - Returns: - list[dict]: A list of dictionaries with the kitti format. - """ - assert len(net_outputs) == len(self.anno_infos) - if submission_prefix is not None: - mmcv.mkdir_or_exist(submission_prefix) - - det_annos = [] - print('\nConverting prediction to KITTI format') - for idx, pred_dicts in enumerate( - mmcv.track_iter_progress(net_outputs)): - annos = [] - info = self.anno_infos[idx] - sample_idx = info['image']['image_idx'] - image_shape = info['image']['image_shape'][:2] - - box_dict = self.convert_valid_bboxes(pred_dicts, info) - anno = { - 'name': [], - 'truncated': [], - 'occluded': [], - 'alpha': [], - 'bbox': [], - 'dimensions': [], - 'location': [], - 'rotation_y': [], - 'score': [] - } - if len(box_dict['bbox']) > 0: - box_2d_preds = box_dict['bbox'] - box_preds = box_dict['box3d_camera'] - scores = box_dict['scores'] - box_preds_lidar = box_dict['box3d_lidar'] - label_preds = box_dict['label_preds'] - - for box, box_lidar, bbox, score, label in zip( - box_preds, box_preds_lidar, box_2d_preds, scores, - label_preds): - bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) - bbox[:2] = np.maximum(bbox[:2], [0, 0]) - anno['name'].append(class_names[int(label)]) - anno['truncated'].append(0.0) - anno['occluded'].append(0) - anno['alpha'].append(-np.arctan2(box[0], box[2]) + box[6]) - anno['bbox'].append(bbox) - anno['dimensions'].append(box[3:6]) - anno['location'].append(box[:3]) - anno['rotation_y'].append(box[6]) - anno['score'].append(score) - - anno = {k: np.stack(v) for k, v in anno.items()} - annos.append(anno) - - else: - anno = { - 'name': np.array([]), - 'truncated': np.array([]), - 'occluded': np.array([]), - 'alpha': np.array([]), - 'bbox': np.zeros([0, 4]), - 'dimensions': np.zeros([0, 3]), - 'location': np.zeros([0, 3]), - 'rotation_y': np.array([]), - 'score': np.array([]), - } - annos.append(anno) - - if submission_prefix is not None: - curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' - with open(curr_file, 'w') as f: - bbox = anno['bbox'] 
- loc = anno['location'] - dims = anno['dimensions'] # lhw -> hwl - - for idx in range(len(bbox)): - print( - '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( - anno['name'][idx], anno['alpha'][idx], - bbox[idx][0], bbox[idx][1], bbox[idx][2], - bbox[idx][3], dims[idx][1], dims[idx][2], - dims[idx][0], loc[idx][0], loc[idx][1], - loc[idx][2], anno['rotation_y'][idx], - anno['score'][idx]), - file=f) - - annos[-1]['sample_idx'] = np.array( - [sample_idx] * len(annos[-1]['score']), dtype=np.int64) - - det_annos += annos - - if pklfile_prefix is not None: - if not pklfile_prefix.endswith(('.pkl', '.pickle')): - out = f'{pklfile_prefix}.pkl' - mmcv.dump(det_annos, out) - print('Result is saved to %s' % out) - - return det_annos - - def bbox2result_kitti2d(self, - net_outputs, - class_names, - pklfile_prefix=None, - submission_prefix=None): - """Convert 2D detection results to kitti format for evaluation and test - submission. - - Args: - net_outputs (list[np.ndarray]): List of array storing the - inferenced bounding boxes and scores. - class_names (list[String]): A list of class names. - pklfile_prefix (str): The prefix of pkl file. - submission_prefix (str): The prefix of submission file. - - Returns: - list[dict]: A list of dictionaries have the kitti format - """ - assert len(net_outputs) == len(self.anno_infos) - - det_annos = [] - print('\nConverting prediction to KITTI format') - for i, bboxes_per_sample in enumerate( - mmcv.track_iter_progress(net_outputs)): - annos = [] - anno = dict( - name=[], - truncated=[], - occluded=[], - alpha=[], - bbox=[], - dimensions=[], - location=[], - rotation_y=[], - score=[]) - sample_idx = self.anno_infos[i]['image']['image_idx'] - - num_example = 0 - for label in range(len(bboxes_per_sample)): - bbox = bboxes_per_sample[label] - for i in range(bbox.shape[0]): - anno['name'].append(class_names[int(label)]) - anno['truncated'].append(0.0) - anno['occluded'].append(0) - anno['alpha'].append(-10) - anno['bbox'].append(bbox[i, :4]) - # set dimensions (height, width, length) to zero - anno['dimensions'].append( - np.zeros(shape=[3], dtype=np.float32)) - # set the 3D translation to (-1000, -1000, -1000) - anno['location'].append( - np.ones(shape=[3], dtype=np.float32) * (-1000.0)) - anno['rotation_y'].append(0.0) - anno['score'].append(bbox[i, 4]) - num_example += 1 - - if num_example == 0: - annos.append( - dict( - name=np.array([]), - truncated=np.array([]), - occluded=np.array([]), - alpha=np.array([]), - bbox=np.zeros([0, 4]), - dimensions=np.zeros([0, 3]), - location=np.zeros([0, 3]), - rotation_y=np.array([]), - score=np.array([]), - )) - else: - anno = {k: np.stack(v) for k, v in anno.items()} - annos.append(anno) - - annos[-1]['sample_idx'] = np.array( - [sample_idx] * num_example, dtype=np.int64) - det_annos += annos - - if pklfile_prefix is not None: - if not pklfile_prefix.endswith(('.pkl', '.pickle')): - out = f'{pklfile_prefix}.pkl' - mmcv.dump(det_annos, out) - print('Result is saved to %s' % out) - - if submission_prefix is not None: - # save file in submission format - mmcv.mkdir_or_exist(submission_prefix) - print(f'Saving KITTI submission to {submission_prefix}') - for i, anno in enumerate(det_annos): - sample_idx = self.anno_infos[i]['image']['image_idx'] - cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt' - with open(cur_det_file, 'w') as f: - bbox = anno['bbox'] - loc = anno['location'] - dims = anno['dimensions'][::-1] # lhw -> hwl - for idx in 
range(len(bbox)): - print( - '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} ' - '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format( - anno['name'][idx], - anno['alpha'][idx], - *bbox[idx], # 4 float - *dims[idx], # 3 float - *loc[idx], # 3 float - anno['rotation_y'][idx], - anno['score'][idx]), - file=f, - ) - print(f'Result is saved to {submission_prefix}') - - return det_annos - - def convert_valid_bboxes(self, box_dict, info): - """Convert the predicted boxes into valid ones. - - Args: - box_dict (dict): Box dictionaries to be converted. - - boxes_3d (:obj:`CameraInstance3DBoxes`): 3D bounding boxes. - - scores_3d (torch.Tensor): Scores of boxes. - - labels_3d (torch.Tensor): Class labels of boxes. - info (dict): Data info. - - Returns: - dict: Valid predicted boxes. - - bbox (np.ndarray): 2D bounding boxes. - - box3d_camera (np.ndarray): 3D bounding boxes in - camera coordinate. - - scores (np.ndarray): Scores of boxes. - - label_preds (np.ndarray): Class label predictions. - - sample_idx (int): Sample index. - """ - box_preds = box_dict['boxes_3d'] - scores = box_dict['scores_3d'] - labels = box_dict['labels_3d'] - sample_idx = info['image']['image_idx'] - - if len(box_preds) == 0: - return dict( - bbox=np.zeros([0, 4]), - box3d_camera=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0, 4]), - sample_idx=sample_idx) - - rect = info['calib']['R0_rect'].astype(np.float32) - Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) - P2 = info['calib']['P2'].astype(np.float32) - img_shape = info['image']['image_shape'] - P2 = box_preds.tensor.new_tensor(P2) - - box_preds_camera = box_preds - box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR, - np.linalg.inv(rect @ Trv2c)) - - box_corners = box_preds_camera.corners - box_corners_in_image = points_cam2img(box_corners, P2) - # box_corners_in_image: [N, 8, 2] - minxy = torch.min(box_corners_in_image, dim=1)[0] - maxxy = torch.max(box_corners_in_image, dim=1)[0] - box_2d_preds = torch.cat([minxy, maxxy], dim=1) - # Post-processing - # check box_preds_camera - image_shape = box_preds.tensor.new_tensor(img_shape) - valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) & - (box_2d_preds[:, 1] < image_shape[0]) & - (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0)) - # check box_preds - valid_inds = valid_cam_inds - - if valid_inds.sum() > 0: - return dict( - bbox=box_2d_preds[valid_inds, :].numpy(), - box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), - box3d_lidar=box_preds_lidar[valid_inds].tensor.numpy(), - scores=scores[valid_inds].numpy(), - label_preds=labels[valid_inds].numpy(), - sample_idx=sample_idx) - else: - return dict( - bbox=np.zeros([0, 4]), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0, 4]), - sample_idx=sample_idx) +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import tempfile +from os import path as osp + +import mmcv +import numpy as np +import torch +from mmcv.utils import print_log + +from ..core.bbox import Box3DMode, CameraInstance3DBoxes, points_cam2img +from .builder import DATASETS +from .nuscenes_mono_dataset import NuScenesMonoDataset + + +@DATASETS.register_module() +class KittiMonoDataset(NuScenesMonoDataset): + """Monocular 3D detection on KITTI Dataset. + + Args: + data_root (str): Path of dataset root. + info_file (str): Path of info file. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. 
+ with_velocity (bool, optional): Whether include velocity prediction + into the experiments. Defaults to False. + eval_version (str, optional): Configuration version of evaluation. + Defaults to None. + version (str, optional): Dataset version. Defaults to None. + kwargs (dict): Other arguments are the same of NuScenesMonoDataset. + """ + + CLASSES = ('Pedestrian', 'Cyclist', 'Car') + + def __init__(self, + data_root, + info_file, + ann_file, + pipeline, + load_interval=1, + with_velocity=False, + eval_version=None, + version=None, + **kwargs): + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + load_interval=load_interval, + with_velocity=with_velocity, + eval_version=eval_version, + version=version, + **kwargs) + self.anno_infos = mmcv.load(info_file) + self.bbox_code_size = 7 + + def _parse_ann_info(self, img_info, ann_info): + """Parse bbox and mask annotation. + + Args: + ann_info (list[dict]): Annotation info of an image. + with_mask (bool): Whether to parse mask annotations. + + Returns: + dict: A dict containing the following keys: bboxes, bboxes_ignore, + labels, masks, seg_map. "masks" are raw annotations and not + decoded into binary masks. + """ + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + gt_bboxes_cam3d = [] + centers2d = [] + depths = [] + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(self.cat2label[ann['category_id']]) + gt_masks_ann.append(ann.get('segmentation', None)) + # 3D annotations in camera coordinates + bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(-1, ) + gt_bboxes_cam3d.append(bbox_cam3d) + # 2.5D annotations in camera coordinates + center2d = ann['center2d'][:2] + depth = ann['center2d'][2] + centers2d.append(center2d) + depths.append(depth) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_cam3d: + gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) + centers2d = np.array(centers2d, dtype=np.float32) + depths = np.array(depths, dtype=np.float32) + else: + gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), + dtype=np.float32) + centers2d = np.zeros((0, 2), dtype=np.float32) + depths = np.zeros((0), dtype=np.float32) + + gt_bboxes_cam3d = CameraInstance3DBoxes( + gt_bboxes_cam3d, + box_dim=gt_bboxes_cam3d.shape[-1], + origin=(0.5, 0.5, 0.5)) + gt_labels_3d = copy.deepcopy(gt_labels) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + seg_map = img_info['filename'].replace('jpg', 'png') + + ann = dict( + bboxes=gt_bboxes, + labels=gt_labels, + gt_bboxes_3d=gt_bboxes_cam3d, + gt_labels_3d=gt_labels_3d, + centers2d=centers2d, + depths=depths, + bboxes_ignore=gt_bboxes_ignore, + masks=gt_masks_ann, + seg_map=seg_map) + + return ann + + def format_results(self, + outputs, + pklfile_prefix=None, + 
submission_prefix=None): + """Format the results to pkl file. + + Args: + outputs (list[dict]): Testing results of the dataset. + pklfile_prefix (str): The prefix of pkl files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + submission_prefix (str): The prefix of submitted files. It + includes the file path and the prefix of filename, e.g., + "a/b/prefix". If not specified, a temp file will be created. + Default: None. + + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created + for saving json files when jsonfile_prefix is not specified. + """ + if pklfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + pklfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + if not isinstance(outputs[0], dict): + result_files = self.bbox2result_kitti2d(outputs, self.CLASSES, + pklfile_prefix, + submission_prefix) + elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0] or \ + 'img_bbox2d' in outputs[0]: + result_files = dict() + for name in outputs[0]: + results_ = [out[name] for out in outputs] + pklfile_prefix_ = pklfile_prefix + name + if submission_prefix is not None: + submission_prefix_ = submission_prefix + name + else: + submission_prefix_ = None + if '2d' in name: + result_files_ = self.bbox2result_kitti2d( + results_, self.CLASSES, pklfile_prefix_, + submission_prefix_) + else: + result_files_ = self.bbox2result_kitti( + results_, self.CLASSES, pklfile_prefix_, + submission_prefix_) + result_files[name] = result_files_ + else: + result_files = self.bbox2result_kitti(outputs, self.CLASSES, + pklfile_prefix, + submission_prefix) + return result_files, tmp_dir + + def evaluate(self, + results, + metric=None, + logger=None, + pklfile_prefix=None, + submission_prefix=None, + show=False, + out_dir=None, + pipeline=None): + """Evaluation in KITTI protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str], optional): Metrics to be evaluated. + Defaults to None. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + pklfile_prefix (str, optional): The prefix of pkl files, including + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + submission_prefix (str, optional): The prefix of submission data. + If not specified, the submission data will not be generated. + show (bool, optional): Whether to visualize. + Default: False. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. 
+ """ + result_files, tmp_dir = self.format_results(results, pklfile_prefix) + from mmdet3d.core.evaluation import kitti_eval + gt_annos = [info['annos'] for info in self.anno_infos] + + if isinstance(result_files, dict): + ap_dict = dict() + for name, result_files_ in result_files.items(): + eval_types = ['bbox', 'bev', '3d'] + if '2d' in name: + eval_types = ['bbox'] + ap_result_str, ap_dict_ = kitti_eval( + gt_annos, + result_files_, + self.CLASSES, + eval_types=eval_types) + for ap_type, ap in ap_dict_.items(): + ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap)) + + print_log( + f'Results of {name}:\n' + ap_result_str, logger=logger) + + else: + if metric == 'img_bbox2d': + ap_result_str, ap_dict = kitti_eval( + gt_annos, result_files, self.CLASSES, eval_types=['bbox']) + else: + ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, + self.CLASSES) + print_log('\n' + ap_result_str, logger=logger) + + if tmp_dir is not None: + tmp_dir.cleanup() + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) + return ap_dict + + def bbox2result_kitti(self, + net_outputs, + class_names, + pklfile_prefix=None, + submission_prefix=None): + """Convert 3D detection results to kitti format for evaluation and test + submission. + + Args: + net_outputs (list[np.ndarray]): List of array storing the + inferenced bounding boxes and scores. + class_names (list[String]): A list of class names. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. + + Returns: + list[dict]: A list of dictionaries with the kitti format. + """ + assert len(net_outputs) == len(self.anno_infos) + if submission_prefix is not None: + mmcv.mkdir_or_exist(submission_prefix) + + det_annos = [] + print('\nConverting prediction to KITTI format') + for idx, pred_dicts in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + info = self.anno_infos[idx] + sample_idx = info['image']['image_idx'] + image_shape = info['image']['image_shape'][:2] + + box_dict = self.convert_valid_bboxes(pred_dicts, info) + anno = { + 'name': [], + 'truncated': [], + 'occluded': [], + 'alpha': [], + 'bbox': [], + 'dimensions': [], + 'location': [], + 'rotation_y': [], + 'score': [] + } + if len(box_dict['bbox']) > 0: + box_2d_preds = box_dict['bbox'] + box_preds = box_dict['box3d_camera'] + scores = box_dict['scores'] + box_preds_lidar = box_dict['box3d_lidar'] + label_preds = box_dict['label_preds'] + + for box, box_lidar, bbox, score, label in zip( + box_preds, box_preds_lidar, box_2d_preds, scores, + label_preds): + bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) + bbox[:2] = np.maximum(bbox[:2], [0, 0]) + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append(-np.arctan2(box[0], box[2]) + box[6]) + anno['bbox'].append(bbox) + anno['dimensions'].append(box[3:6]) + anno['location'].append(box[:3]) + anno['rotation_y'].append(box[6]) + anno['score'].append(score) + + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + + else: + anno = { + 'name': np.array([]), + 'truncated': np.array([]), + 'occluded': np.array([]), + 'alpha': np.array([]), + 'bbox': np.zeros([0, 4]), + 'dimensions': np.zeros([0, 3]), + 'location': np.zeros([0, 3]), + 'rotation_y': np.array([]), + 'score': np.array([]), + } + annos.append(anno) + + if submission_prefix is not None: + curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' + with open(curr_file, 'w') as f: + bbox = anno['bbox'] 
+ loc = anno['location'] + dims = anno['dimensions'] # lhw -> hwl + + for idx in range(len(bbox)): + print( + '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( + anno['name'][idx], anno['alpha'][idx], + bbox[idx][0], bbox[idx][1], bbox[idx][2], + bbox[idx][3], dims[idx][1], dims[idx][2], + dims[idx][0], loc[idx][0], loc[idx][1], + loc[idx][2], anno['rotation_y'][idx], + anno['score'][idx]), + file=f) + + annos[-1]['sample_idx'] = np.array( + [sample_idx] * len(annos[-1]['score']), dtype=np.int64) + + det_annos += annos + + if pklfile_prefix is not None: + if not pklfile_prefix.endswith(('.pkl', '.pickle')): + out = f'{pklfile_prefix}.pkl' + mmcv.dump(det_annos, out) + print('Result is saved to %s' % out) + + return det_annos + + def bbox2result_kitti2d(self, + net_outputs, + class_names, + pklfile_prefix=None, + submission_prefix=None): + """Convert 2D detection results to kitti format for evaluation and test + submission. + + Args: + net_outputs (list[np.ndarray]): List of array storing the + inferenced bounding boxes and scores. + class_names (list[String]): A list of class names. + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. + + Returns: + list[dict]: A list of dictionaries have the kitti format + """ + assert len(net_outputs) == len(self.anno_infos) + + det_annos = [] + print('\nConverting prediction to KITTI format') + for i, bboxes_per_sample in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + anno = dict( + name=[], + truncated=[], + occluded=[], + alpha=[], + bbox=[], + dimensions=[], + location=[], + rotation_y=[], + score=[]) + sample_idx = self.anno_infos[i]['image']['image_idx'] + + num_example = 0 + for label in range(len(bboxes_per_sample)): + bbox = bboxes_per_sample[label] + for i in range(bbox.shape[0]): + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append(-10) + anno['bbox'].append(bbox[i, :4]) + # set dimensions (height, width, length) to zero + anno['dimensions'].append( + np.zeros(shape=[3], dtype=np.float32)) + # set the 3D translation to (-1000, -1000, -1000) + anno['location'].append( + np.ones(shape=[3], dtype=np.float32) * (-1000.0)) + anno['rotation_y'].append(0.0) + anno['score'].append(bbox[i, 4]) + num_example += 1 + + if num_example == 0: + annos.append( + dict( + name=np.array([]), + truncated=np.array([]), + occluded=np.array([]), + alpha=np.array([]), + bbox=np.zeros([0, 4]), + dimensions=np.zeros([0, 3]), + location=np.zeros([0, 3]), + rotation_y=np.array([]), + score=np.array([]), + )) + else: + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + + annos[-1]['sample_idx'] = np.array( + [sample_idx] * num_example, dtype=np.int64) + det_annos += annos + + if pklfile_prefix is not None: + if not pklfile_prefix.endswith(('.pkl', '.pickle')): + out = f'{pklfile_prefix}.pkl' + mmcv.dump(det_annos, out) + print('Result is saved to %s' % out) + + if submission_prefix is not None: + # save file in submission format + mmcv.mkdir_or_exist(submission_prefix) + print(f'Saving KITTI submission to {submission_prefix}') + for i, anno in enumerate(det_annos): + sample_idx = self.anno_infos[i]['image']['image_idx'] + cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt' + with open(cur_det_file, 'w') as f: + bbox = anno['bbox'] + loc = anno['location'] + dims = anno['dimensions'][::-1] # lhw -> hwl + for idx in 
range(len(bbox)): + print( + '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} ' + '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format( + anno['name'][idx], + anno['alpha'][idx], + *bbox[idx], # 4 float + *dims[idx], # 3 float + *loc[idx], # 3 float + anno['rotation_y'][idx], + anno['score'][idx]), + file=f, + ) + print(f'Result is saved to {submission_prefix}') + + return det_annos + + def convert_valid_bboxes(self, box_dict, info): + """Convert the predicted boxes into valid ones. + + Args: + box_dict (dict): Box dictionaries to be converted. + - boxes_3d (:obj:`CameraInstance3DBoxes`): 3D bounding boxes. + - scores_3d (torch.Tensor): Scores of boxes. + - labels_3d (torch.Tensor): Class labels of boxes. + info (dict): Data info. + + Returns: + dict: Valid predicted boxes. + - bbox (np.ndarray): 2D bounding boxes. + - box3d_camera (np.ndarray): 3D bounding boxes in + camera coordinate. + - scores (np.ndarray): Scores of boxes. + - label_preds (np.ndarray): Class label predictions. + - sample_idx (int): Sample index. + """ + box_preds = box_dict['boxes_3d'] + scores = box_dict['scores_3d'] + labels = box_dict['labels_3d'] + sample_idx = info['image']['image_idx'] + + if len(box_preds) == 0: + return dict( + bbox=np.zeros([0, 4]), + box3d_camera=np.zeros([0, 7]), + scores=np.zeros([0]), + label_preds=np.zeros([0, 4]), + sample_idx=sample_idx) + + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P2 = info['calib']['P2'].astype(np.float32) + img_shape = info['image']['image_shape'] + P2 = box_preds.tensor.new_tensor(P2) + + box_preds_camera = box_preds + box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR, + np.linalg.inv(rect @ Trv2c)) + + box_corners = box_preds_camera.corners + box_corners_in_image = points_cam2img(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + box_2d_preds = torch.cat([minxy, maxxy], dim=1) + # Post-processing + # check box_preds_camera + image_shape = box_preds.tensor.new_tensor(img_shape) + valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) & + (box_2d_preds[:, 1] < image_shape[0]) & + (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0)) + # check box_preds + valid_inds = valid_cam_inds + + if valid_inds.sum() > 0: + return dict( + bbox=box_2d_preds[valid_inds, :].numpy(), + box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), + box3d_lidar=box_preds_lidar[valid_inds].tensor.numpy(), + scores=scores[valid_inds].numpy(), + label_preds=labels[valid_inds].numpy(), + sample_idx=sample_idx) + else: + return dict( + bbox=np.zeros([0, 4]), + box3d_camera=np.zeros([0, 7]), + box3d_lidar=np.zeros([0, 7]), + scores=np.zeros([0]), + label_preds=np.zeros([0, 4]), + sample_idx=sample_idx) diff --git a/mmdet3d/datasets/lyft_dataset.py b/mmdet3d/datasets/lyft_dataset.py index 031d86a..60f5cbf 100644 --- a/mmdet3d/datasets/lyft_dataset.py +++ b/mmdet3d/datasets/lyft_dataset.py @@ -1,567 +1,567 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
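For reference, the result writer in bbox2result_kitti above serializes every prediction as one 16-token KITTI result line (type, truncated, occluded, alpha, the four 2D box corners, dimensions printed in h/w/l order, camera-frame location, rotation_y, score). The short standalone sketch below, which is not part of the patch and uses invented values, reproduces that layout so the dims index shuffle (stored as l, h, w; printed as h, w, l) is easier to follow.

import numpy as np

# One hypothetical prediction in the same layout that bbox2result_kitti builds.
anno = {
    'name': np.array(['Car']),
    'alpha': np.array([-1.57]),
    'bbox': np.array([[712.40, 143.00, 810.73, 307.92]]),  # x1, y1, x2, y2
    'dimensions': np.array([[3.69, 1.67, 1.87]]),          # stored as l, h, w
    'location': np.array([[1.84, 1.47, 8.41]]),            # camera-frame x, y, z
    'rotation_y': np.array([-1.56]),
    'score': np.array([0.98]),
}

idx = 0
bbox, dims, loc = anno['bbox'], anno['dimensions'], anno['location']
line = ('{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} '
        '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(
            anno['name'][idx], anno['alpha'][idx],
            bbox[idx][0], bbox[idx][1], bbox[idx][2], bbox[idx][3],
            dims[idx][1], dims[idx][2], dims[idx][0],   # h, w, l
            loc[idx][0], loc[idx][1], loc[idx][2],
            anno['rotation_y'][idx], anno['score'][idx]))
print(line)
# Prints (a single line in practice, wrapped here):
# Car -1 -1 -1.5700 712.4000 143.0000 810.7300 307.9200 1.6700 1.8700
# 3.6900 1.8400 1.4700 8.4100 -1.5600 0.9800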
-import os -import tempfile -from os import path as osp - -import mmcv -import numpy as np -import pandas as pd -from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft -from lyft_dataset_sdk.utils.data_classes import Box as LyftBox -from pyquaternion import Quaternion - -from mmdet3d.core.evaluation.lyft_eval import lyft_eval -from ..core import show_result -from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes -from .builder import DATASETS -from .custom_3d import Custom3DDataset -from .pipelines import Compose - - -@DATASETS.register_module() -class LyftDataset(Custom3DDataset): - r"""Lyft Dataset. - - This class serves as the API for experiments on the Lyft Dataset. - - Please refer to - ``_ - for data downloading. - - Args: - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - data_root (str): Path of dataset root. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - load_interval (int, optional): Interval of loading the dataset. It is - used to uniformly sample the dataset. Defaults to 1. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'LiDAR' in this dataset. Available options includes - - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - """ # noqa: E501 - NameMapping = { - 'bicycle': 'bicycle', - 'bus': 'bus', - 'car': 'car', - 'emergency_vehicle': 'emergency_vehicle', - 'motorcycle': 'motorcycle', - 'other_vehicle': 'other_vehicle', - 'pedestrian': 'pedestrian', - 'truck': 'truck', - 'animal': 'animal' - } - DefaultAttribute = { - 'car': 'is_stationary', - 'truck': 'is_stationary', - 'bus': 'is_stationary', - 'emergency_vehicle': 'is_stationary', - 'other_vehicle': 'is_stationary', - 'motorcycle': 'is_stationary', - 'bicycle': 'is_stationary', - 'pedestrian': 'is_stationary', - 'animal': 'is_stationary' - } - CLASSES = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', - 'motorcycle', 'bicycle', 'pedestrian', 'animal') - - def __init__(self, - ann_file, - pipeline=None, - data_root=None, - classes=None, - load_interval=1, - modality=None, - box_type_3d='LiDAR', - filter_empty_gt=True, - test_mode=False, - **kwargs): - self.load_interval = load_interval - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - modality=modality, - box_type_3d=box_type_3d, - filter_empty_gt=filter_empty_gt, - test_mode=test_mode, - **kwargs) - - if self.modality is None: - self.modality = dict( - use_camera=False, - use_lidar=True, - use_radar=False, - use_map=False, - use_external=False, - ) - - def load_annotations(self, ann_file): - """Load annotations from ann_file. - - Args: - ann_file (str): Path of the annotation file. - - Returns: - list[dict]: List of annotations sorted by timestamps. 
- """ - # loading data from a file-like object needs file format - data = mmcv.load(ann_file, file_format='pkl') - data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) - data_infos = data_infos[::self.load_interval] - self.metadata = data['metadata'] - self.version = self.metadata['version'] - return data_infos - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): sample index - - pts_filename (str): filename of point clouds - - sweeps (list[dict]): infos of sweeps - - timestamp (float): sample timestamp - - img_filename (str, optional): image filename - - lidar2img (list[np.ndarray], optional): transformations - from lidar to different cameras - - ann_info (dict): annotation info - """ - info = self.data_infos[index] - - # standard protocol modified from SECOND.Pytorch - input_dict = dict( - sample_idx=info['token'], - pts_filename=info['lidar_path'], - sweeps=info['sweeps'], - timestamp=info['timestamp'] / 1e6, - ) - - if self.modality['use_camera']: - image_paths = [] - lidar2img_rts = [] - for cam_type, cam_info in info['cams'].items(): - image_paths.append(cam_info['data_path']) - # obtain lidar to image transformation matrix - lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) - lidar2cam_t = cam_info[ - 'sensor2lidar_translation'] @ lidar2cam_r.T - lidar2cam_rt = np.eye(4) - lidar2cam_rt[:3, :3] = lidar2cam_r.T - lidar2cam_rt[3, :3] = -lidar2cam_t - intrinsic = cam_info['cam_intrinsic'] - viewpad = np.eye(4) - viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic - lidar2img_rt = (viewpad @ lidar2cam_rt.T) - lidar2img_rts.append(lidar2img_rt) - - input_dict.update( - dict( - img_filename=image_paths, - lidar2img=lidar2img_rts, - )) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - - return input_dict - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: Annotation information consists of the following keys: - - - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): - 3D ground truth bboxes. - - gt_labels_3d (np.ndarray): Labels of ground truths. - - gt_names (list[str]): Class names of ground truths. - """ - info = self.data_infos[index] - gt_bboxes_3d = info['gt_boxes'] - gt_names_3d = info['gt_names'] - gt_labels_3d = [] - for cat in gt_names_3d: - if cat in self.CLASSES: - gt_labels_3d.append(self.CLASSES.index(cat)) - else: - gt_labels_3d.append(-1) - gt_labels_3d = np.array(gt_labels_3d) - - if 'gt_shape' in info: - gt_shape = info['gt_shape'] - gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_shape], axis=-1) - - # the lyft box center is [0.5, 0.5, 0.5], we change it to be - # the same as KITTI (0.5, 0.5, 0) - gt_bboxes_3d = LiDARInstance3DBoxes( - gt_bboxes_3d, - box_dim=gt_bboxes_3d.shape[-1], - origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) - - anns_results = dict( - gt_bboxes_3d=gt_bboxes_3d, - gt_labels_3d=gt_labels_3d, - ) - return anns_results - - def _format_bbox(self, results, jsonfile_prefix=None): - """Convert the results to the standard format. - - Args: - results (list[dict]): Testing results of the dataset. - jsonfile_prefix (str): The prefix of the output jsonfile. 
- You can specify the output directory/filename by - modifying the jsonfile_prefix. Default: None. - - Returns: - str: Path of the output json file. - """ - lyft_annos = {} - mapped_class_names = self.CLASSES - - print('Start to convert detection format...') - for sample_id, det in enumerate(mmcv.track_iter_progress(results)): - annos = [] - boxes = output_to_lyft_box(det) - sample_token = self.data_infos[sample_id]['token'] - boxes = lidar_lyft_box_to_global(self.data_infos[sample_id], boxes) - for i, box in enumerate(boxes): - name = mapped_class_names[box.label] - lyft_anno = dict( - sample_token=sample_token, - translation=box.center.tolist(), - size=box.wlh.tolist(), - rotation=box.orientation.elements.tolist(), - name=name, - score=box.score) - annos.append(lyft_anno) - lyft_annos[sample_token] = annos - lyft_submissions = { - 'meta': self.modality, - 'results': lyft_annos, - } - - mmcv.mkdir_or_exist(jsonfile_prefix) - res_path = osp.join(jsonfile_prefix, 'results_lyft.json') - print('Results writes to', res_path) - mmcv.dump(lyft_submissions, res_path) - return res_path - - def _evaluate_single(self, - result_path, - logger=None, - metric='bbox', - result_name='pts_bbox'): - """Evaluation for a single model in Lyft protocol. - - Args: - result_path (str): Path of the result file. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - metric (str, optional): Metric name used for evaluation. - Default: 'bbox'. - result_name (str, optional): Result name in the metric prefix. - Default: 'pts_bbox'. - - Returns: - dict: Dictionary of evaluation details. - """ - - output_dir = osp.join(*osp.split(result_path)[:-1]) - lyft = Lyft( - data_path=osp.join(self.data_root, self.version), - json_path=osp.join(self.data_root, self.version, self.version), - verbose=True) - eval_set_map = { - 'v1.01-train': 'val', - } - metrics = lyft_eval(lyft, self.data_root, result_path, - eval_set_map[self.version], output_dir, logger) - - # record metrics - detail = dict() - metric_prefix = f'{result_name}_Lyft' - - for i, name in enumerate(metrics['class_names']): - AP = float(metrics['mAPs_cate'][i]) - detail[f'{metric_prefix}/{name}_AP'] = AP - - detail[f'{metric_prefix}/mAP'] = metrics['Final mAP'] - return detail - - def format_results(self, results, jsonfile_prefix=None, csv_savepath=None): - """Format the results to json (standard format for COCO evaluation). - - Args: - results (list[dict]): Testing results of the dataset. - jsonfile_prefix (str): The prefix of json files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - csv_savepath (str): The path for saving csv files. - It includes the file path and the csv filename, - e.g., "a/b/filename.csv". If not specified, - the result will not be converted to csv file. - - Returns: - tuple: Returns (result_files, tmp_dir), where `result_files` is a - dict containing the json filepaths, `tmp_dir` is the temporal - directory created for saving json files when - `jsonfile_prefix` is not specified. - """ - assert isinstance(results, list), 'results must be a list' - assert len(results) == len(self), ( - 'The length of results is not equal to the dataset len: {} != {}'. 
- format(len(results), len(self))) - - if jsonfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - jsonfile_prefix = osp.join(tmp_dir.name, 'results') - else: - tmp_dir = None - - # currently the output prediction results could be in two formats - # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) - # 2. list of dict('pts_bbox' or 'img_bbox': - # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) - # this is a workaround to enable evaluation of both formats on Lyft - # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 - if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): - result_files = self._format_bbox(results, jsonfile_prefix) - else: - # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict - result_files = dict() - for name in results[0]: - print(f'\nFormating bboxes of {name}') - results_ = [out[name] for out in results] - tmp_file_ = osp.join(jsonfile_prefix, name) - result_files.update( - {name: self._format_bbox(results_, tmp_file_)}) - if csv_savepath is not None: - self.json2csv(result_files['pts_bbox'], csv_savepath) - return result_files, tmp_dir - - def evaluate(self, - results, - metric='bbox', - logger=None, - jsonfile_prefix=None, - csv_savepath=None, - result_names=['pts_bbox'], - show=False, - out_dir=None, - pipeline=None): - """Evaluation in Lyft protocol. - - Args: - results (list[dict]): Testing results of the dataset. - metric (str | list[str], optional): Metrics to be evaluated. - Default: 'bbox'. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - jsonfile_prefix (str, optional): The prefix of json files including - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - csv_savepath (str, optional): The path for saving csv files. - It includes the file path and the csv filename, - e.g., "a/b/filename.csv". If not specified, - the result will not be converted to csv file. - result_names (list[str], optional): Result names in the - metric prefix. Default: ['pts_bbox']. - show (bool, optional): Whether to visualize. - Default: False. - out_dir (str, optional): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict[str, float]: Evaluation results. 
- """ - result_files, tmp_dir = self.format_results(results, jsonfile_prefix, - csv_savepath) - - if isinstance(result_files, dict): - results_dict = dict() - for name in result_names: - print(f'Evaluating bboxes of {name}') - ret_dict = self._evaluate_single(result_files[name]) - results_dict.update(ret_dict) - elif isinstance(result_files, str): - results_dict = self._evaluate_single(result_files) - - if tmp_dir is not None: - tmp_dir.cleanup() - - if show or out_dir: - self.show(results, out_dir, show=show, pipeline=pipeline) - return results_dict - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=dict(backend='disk')), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=10, - file_client_args=dict(backend='disk')), - dict( - type='DefaultFormatBundle3D', - class_names=self.CLASSES, - with_label=False), - dict(type='Collect3D', keys=['points']) - ] - return Compose(pipeline) - - def show(self, results, out_dir, show=False, pipeline=None): - """Results visualization. - - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Whether to visualize the results online. - Default: False. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - if 'pts_bbox' in result.keys(): - result = result['pts_bbox'] - data_info = self.data_infos[i] - pts_path = data_info['lidar_path'] - file_name = osp.split(pts_path)[-1].split('.')[0] - points = self._extract_data(i, pipeline, 'points').numpy() - points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, - Coord3DMode.DEPTH) - inds = result['scores_3d'] > 0.1 - gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() - show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, - Box3DMode.DEPTH) - pred_bboxes = result['boxes_3d'][inds].tensor.numpy() - show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, - Box3DMode.DEPTH) - show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, - file_name, show) - - def json2csv(self, json_path, csv_savepath): - """Convert the json file to csv format for submission. - - Args: - json_path (str): Path of the result json file. - csv_savepath (str): Path to save the csv file. 
- """ - results = mmcv.load(json_path)['results'] - sample_list_path = osp.join(self.data_root, 'sample_submission.csv') - data = pd.read_csv(sample_list_path) - Id_list = list(data['Id']) - pred_list = list(data['PredictionString']) - cnt = 0 - print('Converting the json to csv...') - for token in results.keys(): - cnt += 1 - predictions = results[token] - prediction_str = '' - for i in range(len(predictions)): - prediction_str += \ - str(predictions[i]['score']) + ' ' + \ - str(predictions[i]['translation'][0]) + ' ' + \ - str(predictions[i]['translation'][1]) + ' ' + \ - str(predictions[i]['translation'][2]) + ' ' + \ - str(predictions[i]['size'][0]) + ' ' + \ - str(predictions[i]['size'][1]) + ' ' + \ - str(predictions[i]['size'][2]) + ' ' + \ - str(Quaternion(list(predictions[i]['rotation'])) - .yaw_pitch_roll[0]) + ' ' + \ - predictions[i]['name'] + ' ' - prediction_str = prediction_str[:-1] - idx = Id_list.index(token) - pred_list[idx] = prediction_str - df = pd.DataFrame({'Id': Id_list, 'PredictionString': pred_list}) - mmcv.mkdir_or_exist(os.path.dirname(csv_savepath)) - df.to_csv(csv_savepath, index=False) - - -def output_to_lyft_box(detection): - """Convert the output to the box class in the Lyft. - - Args: - detection (dict): Detection results. - - Returns: - list[:obj:`LyftBox`]: List of standard LyftBoxes. - """ - box3d = detection['boxes_3d'] - scores = detection['scores_3d'].numpy() - labels = detection['labels_3d'].numpy() - - box_gravity_center = box3d.gravity_center.numpy() - box_dims = box3d.dims.numpy() - box_yaw = box3d.yaw.numpy() - - # our LiDAR coordinate system -> Lyft box coordinate system - lyft_box_dims = box_dims[:, [1, 0, 2]] - - box_list = [] - for i in range(len(box3d)): - quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) - box = LyftBox( - box_gravity_center[i], - lyft_box_dims[i], - quat, - label=labels[i], - score=scores[i]) - box_list.append(box) - return box_list - - -def lidar_lyft_box_to_global(info, boxes): - """Convert the box from ego to global coordinate. - - Args: - info (dict): Info for a specific sample data, including the - calibration information. - boxes (list[:obj:`LyftBox`]): List of predicted LyftBoxes. - - Returns: - list: List of standard LyftBoxes in the global - coordinate. - """ - box_list = [] - for box in boxes: - # Move box to ego vehicle coord system - box.rotate(Quaternion(info['lidar2ego_rotation'])) - box.translate(np.array(info['lidar2ego_translation'])) - # Move box to global coord system - box.rotate(Quaternion(info['ego2global_rotation'])) - box.translate(np.array(info['ego2global_translation'])) - box_list.append(box) - return box_list +# Copyright (c) OpenMMLab. All rights reserved. +import os +import tempfile +from os import path as osp + +import mmcv +import numpy as np +import pandas as pd +from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft +from lyft_dataset_sdk.utils.data_classes import Box as LyftBox +from pyquaternion import Quaternion + +from mmdet3d.core.evaluation.lyft_eval import lyft_eval +from ..core import show_result +from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes +from .builder import DATASETS +from .custom_3d import Custom3DDataset +from .pipelines import Compose + + +@DATASETS.register_module() +class LyftDataset(Custom3DDataset): + r"""Lyft Dataset. + + This class serves as the API for experiments on the Lyft Dataset. + + Please refer to + ``_ + for data downloading. + + Args: + ann_file (str): Path of annotation file. 
+ pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + data_root (str): Path of dataset root. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR' in this dataset. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ # noqa: E501 + NameMapping = { + 'bicycle': 'bicycle', + 'bus': 'bus', + 'car': 'car', + 'emergency_vehicle': 'emergency_vehicle', + 'motorcycle': 'motorcycle', + 'other_vehicle': 'other_vehicle', + 'pedestrian': 'pedestrian', + 'truck': 'truck', + 'animal': 'animal' + } + DefaultAttribute = { + 'car': 'is_stationary', + 'truck': 'is_stationary', + 'bus': 'is_stationary', + 'emergency_vehicle': 'is_stationary', + 'other_vehicle': 'is_stationary', + 'motorcycle': 'is_stationary', + 'bicycle': 'is_stationary', + 'pedestrian': 'is_stationary', + 'animal': 'is_stationary' + } + CLASSES = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', + 'motorcycle', 'bicycle', 'pedestrian', 'animal') + + def __init__(self, + ann_file, + pipeline=None, + data_root=None, + classes=None, + load_interval=1, + modality=None, + box_type_3d='LiDAR', + filter_empty_gt=True, + test_mode=False, + **kwargs): + self.load_interval = load_interval + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + **kwargs) + + if self.modality is None: + self.modality = dict( + use_camera=False, + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + ) + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations sorted by timestamps. + """ + # loading data from a file-like object needs file format + data = mmcv.load(ann_file, file_format='pkl') + data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) + data_infos = data_infos[::self.load_interval] + self.metadata = data['metadata'] + self.version = self.metadata['version'] + return data_infos + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. 
It includes the following keys: + + - sample_idx (str): sample index + - pts_filename (str): filename of point clouds + - sweeps (list[dict]): infos of sweeps + - timestamp (float): sample timestamp + - img_filename (str, optional): image filename + - lidar2img (list[np.ndarray], optional): transformations + from lidar to different cameras + - ann_info (dict): annotation info + """ + info = self.data_infos[index] + + # standard protocol modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + timestamp=info['timestamp'] / 1e6, + ) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + )) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): + 3D ground truth bboxes. + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. + """ + info = self.data_infos[index] + gt_bboxes_3d = info['gt_boxes'] + gt_names_3d = info['gt_names'] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if 'gt_shape' in info: + gt_shape = info['gt_shape'] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_shape], axis=-1) + + # the lyft box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + ) + return anns_results + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. 
+ """ + lyft_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + annos = [] + boxes = output_to_lyft_box(det) + sample_token = self.data_infos[sample_id]['token'] + boxes = lidar_lyft_box_to_global(self.data_infos[sample_id], boxes) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + lyft_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + name=name, + score=box.score) + annos.append(lyft_anno) + lyft_annos[sample_token] = annos + lyft_submissions = { + 'meta': self.modality, + 'results': lyft_annos, + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_lyft.json') + print('Results writes to', res_path) + mmcv.dump(lyft_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='pts_bbox'): + """Evaluation for a single model in Lyft protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + metric (str, optional): Metric name used for evaluation. + Default: 'bbox'. + result_name (str, optional): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. + """ + + output_dir = osp.join(*osp.split(result_path)[:-1]) + lyft = Lyft( + data_path=osp.join(self.data_root, self.version), + json_path=osp.join(self.data_root, self.version, self.version), + verbose=True) + eval_set_map = { + 'v1.01-train': 'val', + } + metrics = lyft_eval(lyft, self.data_root, result_path, + eval_set_map[self.version], output_dir, logger) + + # record metrics + detail = dict() + metric_prefix = f'{result_name}_Lyft' + + for i, name in enumerate(metrics['class_names']): + AP = float(metrics['mAPs_cate'][i]) + detail[f'{metric_prefix}/{name}_AP'] = AP + + detail[f'{metric_prefix}/mAP'] = metrics['Final mAP'] + return detail + + def format_results(self, results, jsonfile_prefix=None, csv_savepath=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + csv_savepath (str): The path for saving csv files. + It includes the file path and the csv filename, + e.g., "a/b/filename.csv". If not specified, + the result will not be converted to csv file. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a + dict containing the json filepaths, `tmp_dir` is the temporal + directory created for saving json files when + `jsonfile_prefix` is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. 
list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on Lyft + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + if csv_savepath is not None: + self.json2csv(result_files['pts_bbox'], csv_savepath) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + csv_savepath=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in Lyft protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str], optional): Metrics to be evaluated. + Default: 'bbox'. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str, optional): The prefix of json files including + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + csv_savepath (str, optional): The path for saving csv files. + It includes the file path and the csv filename, + e.g., "a/b/filename.csv". If not specified, + the result will not be converted to csv file. + result_names (list[str], optional): Result names in the + metric prefix. Default: ['pts_bbox']. + show (bool, optional): Whether to visualize. + Default: False. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Evaluation results. + """ + result_files, tmp_dir = self.format_results(results, jsonfile_prefix, + csv_savepath) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print(f'Evaluating bboxes of {name}') + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) + return results_dict + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=dict(backend='disk')), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=dict(backend='disk')), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['points']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=False, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Whether to visualize the results online. + Default: False. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. 
+ """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + if 'pts_bbox' in result.keys(): + result = result['pts_bbox'] + data_info = self.data_infos[i] + pts_path = data_info['lidar_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points = self._extract_data(i, pipeline, 'points').numpy() + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + inds = result['scores_3d'] > 0.1 + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() + show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + pred_bboxes = result['boxes_3d'][inds].tensor.numpy() + show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, + file_name, show) + + def json2csv(self, json_path, csv_savepath): + """Convert the json file to csv format for submission. + + Args: + json_path (str): Path of the result json file. + csv_savepath (str): Path to save the csv file. + """ + results = mmcv.load(json_path)['results'] + sample_list_path = osp.join(self.data_root, 'sample_submission.csv') + data = pd.read_csv(sample_list_path) + Id_list = list(data['Id']) + pred_list = list(data['PredictionString']) + cnt = 0 + print('Converting the json to csv...') + for token in results.keys(): + cnt += 1 + predictions = results[token] + prediction_str = '' + for i in range(len(predictions)): + prediction_str += \ + str(predictions[i]['score']) + ' ' + \ + str(predictions[i]['translation'][0]) + ' ' + \ + str(predictions[i]['translation'][1]) + ' ' + \ + str(predictions[i]['translation'][2]) + ' ' + \ + str(predictions[i]['size'][0]) + ' ' + \ + str(predictions[i]['size'][1]) + ' ' + \ + str(predictions[i]['size'][2]) + ' ' + \ + str(Quaternion(list(predictions[i]['rotation'])) + .yaw_pitch_roll[0]) + ' ' + \ + predictions[i]['name'] + ' ' + prediction_str = prediction_str[:-1] + idx = Id_list.index(token) + pred_list[idx] = prediction_str + df = pd.DataFrame({'Id': Id_list, 'PredictionString': pred_list}) + mmcv.mkdir_or_exist(os.path.dirname(csv_savepath)) + df.to_csv(csv_savepath, index=False) + + +def output_to_lyft_box(detection): + """Convert the output to the box class in the Lyft. + + Args: + detection (dict): Detection results. + + Returns: + list[:obj:`LyftBox`]: List of standard LyftBoxes. + """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + + # our LiDAR coordinate system -> Lyft box coordinate system + lyft_box_dims = box_dims[:, [1, 0, 2]] + + box_list = [] + for i in range(len(box3d)): + quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + box = LyftBox( + box_gravity_center[i], + lyft_box_dims[i], + quat, + label=labels[i], + score=scores[i]) + box_list.append(box) + return box_list + + +def lidar_lyft_box_to_global(info, boxes): + """Convert the box from ego to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`LyftBox`]): List of predicted LyftBoxes. + + Returns: + list: List of standard LyftBoxes in the global + coordinate. 
+ """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # Move box to global coord system + box.rotate(Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list diff --git a/mmdet3d/datasets/nuscenes_dataset.py b/mmdet3d/datasets/nuscenes_dataset.py index 1ca8265..70b34d5 100644 --- a/mmdet3d/datasets/nuscenes_dataset.py +++ b/mmdet3d/datasets/nuscenes_dataset.py @@ -1,654 +1,654 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import tempfile -from os import path as osp - -import mmcv -import numpy as np -import pyquaternion -from nuscenes.utils.data_classes import Box as NuScenesBox - -from ..core import show_result -from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes -from .builder import DATASETS -from .custom_3d import Custom3DDataset -from .pipelines import Compose - - -@DATASETS.register_module() -class NuScenesDataset(Custom3DDataset): - r"""NuScenes Dataset. - - This class serves as the API for experiments on the NuScenes Dataset. - - Please refer to `NuScenes Dataset `_ - for data downloading. - - Args: - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - data_root (str): Path of dataset root. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - load_interval (int, optional): Interval of loading the dataset. It is - used to uniformly sample the dataset. Defaults to 1. - with_velocity (bool, optional): Whether include velocity prediction - into the experiments. Defaults to True. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'LiDAR' in this dataset. Available options includes. - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - eval_version (bool, optional): Configuration version of evaluation. - Defaults to 'detection_cvpr_2019'. - use_valid_flag (bool, optional): Whether to use `use_valid_flag` key - in the info file as mask to filter gt_boxes and gt_names. - Defaults to False. 
- """ - NameMapping = { - 'movable_object.barrier': 'barrier', - 'vehicle.bicycle': 'bicycle', - 'vehicle.bus.bendy': 'bus', - 'vehicle.bus.rigid': 'bus', - 'vehicle.car': 'car', - 'vehicle.construction': 'construction_vehicle', - 'vehicle.motorcycle': 'motorcycle', - 'human.pedestrian.adult': 'pedestrian', - 'human.pedestrian.child': 'pedestrian', - 'human.pedestrian.construction_worker': 'pedestrian', - 'human.pedestrian.police_officer': 'pedestrian', - 'movable_object.trafficcone': 'traffic_cone', - 'vehicle.trailer': 'trailer', - 'vehicle.truck': 'truck' - } - DefaultAttribute = { - 'car': 'vehicle.parked', - 'pedestrian': 'pedestrian.moving', - 'trailer': 'vehicle.parked', - 'truck': 'vehicle.parked', - 'bus': 'vehicle.moving', - 'motorcycle': 'cycle.without_rider', - 'construction_vehicle': 'vehicle.parked', - 'bicycle': 'cycle.without_rider', - 'barrier': '', - 'traffic_cone': '', - } - AttrMapping = { - 'cycle.with_rider': 0, - 'cycle.without_rider': 1, - 'pedestrian.moving': 2, - 'pedestrian.standing': 3, - 'pedestrian.sitting_lying_down': 4, - 'vehicle.moving': 5, - 'vehicle.parked': 6, - 'vehicle.stopped': 7, - } - AttrMapping_rev = [ - 'cycle.with_rider', - 'cycle.without_rider', - 'pedestrian.moving', - 'pedestrian.standing', - 'pedestrian.sitting_lying_down', - 'vehicle.moving', - 'vehicle.parked', - 'vehicle.stopped', - ] - # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa - ErrNameMapping = { - 'trans_err': 'mATE', - 'scale_err': 'mASE', - 'orient_err': 'mAOE', - 'vel_err': 'mAVE', - 'attr_err': 'mAAE' - } - CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', - 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', - 'barrier') - - def __init__(self, - ann_file, - pipeline=None, - data_root=None, - classes=None, - load_interval=1, - with_velocity=True, - modality=None, - box_type_3d='LiDAR', - filter_empty_gt=True, - test_mode=False, - eval_version='detection_cvpr_2019', - use_valid_flag=False): - self.load_interval = load_interval - self.use_valid_flag = use_valid_flag - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - modality=modality, - box_type_3d=box_type_3d, - filter_empty_gt=filter_empty_gt, - test_mode=test_mode) - - self.with_velocity = with_velocity - self.eval_version = eval_version - from nuscenes.eval.detection.config import config_factory - self.eval_detection_configs = config_factory(self.eval_version) - if self.modality is None: - self.modality = dict( - use_camera=False, - use_lidar=True, - use_radar=False, - use_map=False, - use_external=False, - ) - - def get_cat_ids(self, idx): - """Get category distribution of single scene. - - Args: - idx (int): Index of the data_info. - - Returns: - dict[list]: for each category, if the current scene - contains such boxes, store a list containing idx, - otherwise, store empty list. - """ - info = self.data_infos[idx] - if self.use_valid_flag: - mask = info['valid_flag'] - gt_names = set(info['gt_names'][mask]) - else: - gt_names = set(info['gt_names']) - - cat_ids = [] - for name in gt_names: - if name in self.CLASSES: - cat_ids.append(self.cat2id[name]) - return cat_ids - - def load_annotations(self, ann_file): - """Load annotations from ann_file. - - Args: - ann_file (str): Path of the annotation file. - - Returns: - list[dict]: List of annotations sorted by timestamps. 
- """ - data = mmcv.load(ann_file, file_format='pkl') - data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) - data_infos = data_infos[::self.load_interval] - self.metadata = data['metadata'] - self.version = self.metadata['version'] - return data_infos - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): Sample index. - - pts_filename (str): Filename of point clouds. - - sweeps (list[dict]): Infos of sweeps. - - timestamp (float): Sample timestamp. - - img_filename (str, optional): Image filename. - - lidar2img (list[np.ndarray], optional): Transformations - from lidar to different cameras. - - ann_info (dict): Annotation info. - """ - info = self.data_infos[index] - # standard protocol modified from SECOND.Pytorch - input_dict = dict( - sample_idx=info['token'], - pts_filename=info['lidar_path'], - sweeps=info['sweeps'], - timestamp=info['timestamp'] / 1e6, - ) - - if self.modality['use_camera']: - image_paths = [] - lidar2img_rts = [] - for cam_type, cam_info in info['cams'].items(): - image_paths.append(cam_info['data_path']) - # obtain lidar to image transformation matrix - lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) - lidar2cam_t = cam_info[ - 'sensor2lidar_translation'] @ lidar2cam_r.T - lidar2cam_rt = np.eye(4) - lidar2cam_rt[:3, :3] = lidar2cam_r.T - lidar2cam_rt[3, :3] = -lidar2cam_t - intrinsic = cam_info['cam_intrinsic'] - viewpad = np.eye(4) - viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic - lidar2img_rt = (viewpad @ lidar2cam_rt.T) - lidar2img_rts.append(lidar2img_rt) - - input_dict.update( - dict( - img_filename=image_paths, - lidar2img=lidar2img_rts, - )) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - - return input_dict - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: Annotation information consists of the following keys: - - - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): - 3D ground truth bboxes - - gt_labels_3d (np.ndarray): Labels of ground truths. - - gt_names (list[str]): Class names of ground truths. 
- """ - info = self.data_infos[index] - # filter out bbox containing no points - if self.use_valid_flag: - mask = info['valid_flag'] - else: - mask = info['num_lidar_pts'] > 0 - gt_bboxes_3d = info['gt_boxes'][mask] - gt_names_3d = info['gt_names'][mask] - gt_labels_3d = [] - for cat in gt_names_3d: - if cat in self.CLASSES: - gt_labels_3d.append(self.CLASSES.index(cat)) - else: - gt_labels_3d.append(-1) - gt_labels_3d = np.array(gt_labels_3d) - - if self.with_velocity: - gt_velocity = info['gt_velocity'][mask] - nan_mask = np.isnan(gt_velocity[:, 0]) - gt_velocity[nan_mask] = [0.0, 0.0] - gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) - - # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be - # the same as KITTI (0.5, 0.5, 0) - gt_bboxes_3d = LiDARInstance3DBoxes( - gt_bboxes_3d, - box_dim=gt_bboxes_3d.shape[-1], - origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) - - anns_results = dict( - gt_bboxes_3d=gt_bboxes_3d, - gt_labels_3d=gt_labels_3d, - gt_names=gt_names_3d) - return anns_results - - def _format_bbox(self, results, jsonfile_prefix=None): - """Convert the results to the standard format. - - Args: - results (list[dict]): Testing results of the dataset. - jsonfile_prefix (str): The prefix of the output jsonfile. - You can specify the output directory/filename by - modifying the jsonfile_prefix. Default: None. - - Returns: - str: Path of the output json file. - """ - nusc_annos = {} - mapped_class_names = self.CLASSES - - print('Start to convert detection format...') - for sample_id, det in enumerate(mmcv.track_iter_progress(results)): - annos = [] - boxes = output_to_nusc_box(det) - sample_token = self.data_infos[sample_id]['token'] - boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, - mapped_class_names, - self.eval_detection_configs, - self.eval_version) - for i, box in enumerate(boxes): - name = mapped_class_names[box.label] - if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: - if name in [ - 'car', - 'construction_vehicle', - 'bus', - 'truck', - 'trailer', - ]: - attr = 'vehicle.moving' - elif name in ['bicycle', 'motorcycle']: - attr = 'cycle.with_rider' - else: - attr = NuScenesDataset.DefaultAttribute[name] - else: - if name in ['pedestrian']: - attr = 'pedestrian.standing' - elif name in ['bus']: - attr = 'vehicle.stopped' - else: - attr = NuScenesDataset.DefaultAttribute[name] - - nusc_anno = dict( - sample_token=sample_token, - translation=box.center.tolist(), - size=box.wlh.tolist(), - rotation=box.orientation.elements.tolist(), - velocity=box.velocity[:2].tolist(), - detection_name=name, - detection_score=box.score, - attribute_name=attr) - annos.append(nusc_anno) - nusc_annos[sample_token] = annos - nusc_submissions = { - 'meta': self.modality, - 'results': nusc_annos, - } - - mmcv.mkdir_or_exist(jsonfile_prefix) - res_path = osp.join(jsonfile_prefix, 'results_nusc.json') - print('Results writes to', res_path) - mmcv.dump(nusc_submissions, res_path) - return res_path - - def _evaluate_single(self, - result_path, - logger=None, - metric='bbox', - result_name='pts_bbox'): - """Evaluation for a single model in nuScenes protocol. - - Args: - result_path (str): Path of the result file. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - metric (str, optional): Metric name used for evaluation. - Default: 'bbox'. - result_name (str, optional): Result name in the metric prefix. - Default: 'pts_bbox'. 
- - Returns: - dict: Dictionary of evaluation details. - """ - from nuscenes import NuScenes - from nuscenes.eval.detection.evaluate import NuScenesEval - - output_dir = osp.join(*osp.split(result_path)[:-1]) - nusc = NuScenes( - version=self.version, dataroot=self.data_root, verbose=False) - eval_set_map = { - 'v1.0-mini': 'mini_val', - 'v1.0-trainval': 'val', - } - nusc_eval = NuScenesEval( - nusc, - config=self.eval_detection_configs, - result_path=result_path, - eval_set=eval_set_map[self.version], - output_dir=output_dir, - verbose=False) - nusc_eval.main(render_curves=False) - - # record metrics - metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) - detail = dict() - metric_prefix = f'{result_name}_NuScenes' - for name in self.CLASSES: - for k, v in metrics['label_aps'][name].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val - for k, v in metrics['label_tp_errors'][name].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}_{}'.format(metric_prefix, name, k)] = val - for k, v in metrics['tp_errors'].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}'.format(metric_prefix, - self.ErrNameMapping[k])] = val - - detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] - detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] - return detail - - def format_results(self, results, jsonfile_prefix=None): - """Format the results to json (standard format for COCO evaluation). - - Args: - results (list[dict]): Testing results of the dataset. - jsonfile_prefix (str): The prefix of json files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - - Returns: - tuple: Returns (result_files, tmp_dir), where `result_files` is a - dict containing the json filepaths, `tmp_dir` is the temporal - directory created for saving json files when - `jsonfile_prefix` is not specified. - """ - assert isinstance(results, list), 'results must be a list' - assert len(results) == len(self), ( - 'The length of results is not equal to the dataset len: {} != {}'. - format(len(results), len(self))) - - if jsonfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - jsonfile_prefix = osp.join(tmp_dir.name, 'results') - else: - tmp_dir = None - - # currently the output prediction results could be in two formats - # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) - # 2. list of dict('pts_bbox' or 'img_bbox': - # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) - # this is a workaround to enable evaluation of both formats on nuScenes - # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 - if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): - result_files = self._format_bbox(results, jsonfile_prefix) - else: - # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict - result_files = dict() - for name in results[0]: - print(f'\nFormating bboxes of {name}') - results_ = [out[name] for out in results] - tmp_file_ = osp.join(jsonfile_prefix, name) - result_files.update( - {name: self._format_bbox(results_, tmp_file_)}) - return result_files, tmp_dir - - def evaluate(self, - results, - metric='bbox', - logger=None, - jsonfile_prefix=None, - result_names=['pts_bbox'], - show=False, - out_dir=None, - pipeline=None): - """Evaluation in nuScenes protocol. - - Args: - results (list[dict]): Testing results of the dataset. - metric (str | list[str], optional): Metrics to be evaluated. 
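# Sketch of the jsonfile_prefix handling in format_results() above: when
# no prefix is supplied, the nuScenes submission JSON is staged inside a
# TemporaryDirectory and removed again after evaluation. Plain json/os
# calls stand in here for mmcv.dump / mmcv.mkdir_or_exist.
import json
import os
import tempfile

jsonfile_prefix = None
tmp_dir = None
if jsonfile_prefix is None:
    tmp_dir = tempfile.TemporaryDirectory()
    jsonfile_prefix = os.path.join(tmp_dir.name, 'results')

os.makedirs(jsonfile_prefix, exist_ok=True)
res_path = os.path.join(jsonfile_prefix, 'results_nusc.json')
with open(res_path, 'w') as f:
    json.dump({'meta': {'use_lidar': True}, 'results': {}}, f)  # stand-in submission
print('staged submission at', res_path)

if tmp_dir is not None:
    tmp_dir.cleanup()   # evaluate() does this once scoring has finished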
- Default: 'bbox'. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - jsonfile_prefix (str, optional): The prefix of json files including - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - show (bool, optional): Whether to visualize. - Default: False. - out_dir (str, optional): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict[str, float]: Results of each evaluation metric. - """ - result_files, tmp_dir = self.format_results(results, jsonfile_prefix) - - if isinstance(result_files, dict): - results_dict = dict() - for name in result_names: - print('Evaluating bboxes of {}'.format(name)) - ret_dict = self._evaluate_single(result_files[name]) - results_dict.update(ret_dict) - elif isinstance(result_files, str): - results_dict = self._evaluate_single(result_files) - - if tmp_dir is not None: - tmp_dir.cleanup() - - if show or out_dir: - self.show(results, out_dir, show=show, pipeline=pipeline) - return results_dict - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=5, - file_client_args=dict(backend='disk')), - dict( - type='LoadPointsFromMultiSweeps', - sweeps_num=10, - file_client_args=dict(backend='disk')), - dict( - type='DefaultFormatBundle3D', - class_names=self.CLASSES, - with_label=False), - dict(type='Collect3D', keys=['points']) - ] - return Compose(pipeline) - - def show(self, results, out_dir, show=False, pipeline=None): - """Results visualization. - - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Whether to visualize the results online. - Default: False. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - if 'pts_bbox' in result.keys(): - result = result['pts_bbox'] - data_info = self.data_infos[i] - pts_path = data_info['lidar_path'] - file_name = osp.split(pts_path)[-1].split('.')[0] - points = self._extract_data(i, pipeline, 'points').numpy() - # for now we convert points into depth mode - points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, - Coord3DMode.DEPTH) - inds = result['scores_3d'] > 0.1 - gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() - show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, - Box3DMode.DEPTH) - pred_bboxes = result['boxes_3d'][inds].tensor.numpy() - show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, - Box3DMode.DEPTH) - show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, - file_name, show) - - -def output_to_nusc_box(detection): - """Convert the output to the box class in the nuScenes. - - Args: - detection (dict): Detection results. - - - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - - scores_3d (torch.Tensor): Detection scores. - - labels_3d (torch.Tensor): Predicted box labels. - - Returns: - list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
- """ - box3d = detection['boxes_3d'] - scores = detection['scores_3d'].numpy() - labels = detection['labels_3d'].numpy() - - box_gravity_center = box3d.gravity_center.numpy() - box_dims = box3d.dims.numpy() - box_yaw = box3d.yaw.numpy() - - # our LiDAR coordinate system -> nuScenes box coordinate system - nus_box_dims = box_dims[:, [1, 0, 2]] - - box_list = [] - for i in range(len(box3d)): - quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) - velocity = (*box3d.tensor[i, 7:9], 0.0) - # velo_val = np.linalg.norm(box3d[i, 7:9]) - # velo_ori = box3d[i, 6] - # velocity = ( - # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) - box = NuScenesBox( - box_gravity_center[i], - nus_box_dims[i], - quat, - label=labels[i], - score=scores[i], - velocity=velocity) - box_list.append(box) - return box_list - - -def lidar_nusc_box_to_global(info, - boxes, - classes, - eval_configs, - eval_version='detection_cvpr_2019'): - """Convert the box from ego to global coordinate. - - Args: - info (dict): Info for a specific sample data, including the - calibration information. - boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. - classes (list[str]): Mapped classes in the evaluation. - eval_configs (object): Evaluation configuration object. - eval_version (str, optional): Evaluation version. - Default: 'detection_cvpr_2019' - - Returns: - list: List of standard NuScenesBoxes in the global - coordinate. - """ - box_list = [] - for box in boxes: - # Move box to ego vehicle coord system - box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) - box.translate(np.array(info['lidar2ego_translation'])) - # filter det in ego. - cls_range_map = eval_configs.class_range - radius = np.linalg.norm(box.center[:2], 2) - det_range = cls_range_map[classes[box.label]] - if radius > det_range: - continue - # Move box to global coord system - box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) - box.translate(np.array(info['ego2global_translation'])) - box_list.append(box) - return box_list +# Copyright (c) OpenMMLab. All rights reserved. +import tempfile +from os import path as osp + +import mmcv +import numpy as np +import pyquaternion +from nuscenes.utils.data_classes import Box as NuScenesBox + +from ..core import show_result +from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes +from .builder import DATASETS +from .custom_3d import Custom3DDataset +from .pipelines import Compose + + +@DATASETS.register_module() +class NuScenesDataset(Custom3DDataset): + r"""NuScenes Dataset. + + This class serves as the API for experiments on the NuScenes Dataset. + + Please refer to `NuScenes Dataset `_ + for data downloading. + + Args: + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + data_root (str): Path of dataset root. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + with_velocity (bool, optional): Whether include velocity prediction + into the experiments. Defaults to True. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR' in this dataset. 
Available options includes. + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + eval_version (bool, optional): Configuration version of evaluation. + Defaults to 'detection_cvpr_2019'. + use_valid_flag (bool, optional): Whether to use `use_valid_flag` key + in the info file as mask to filter gt_boxes and gt_names. + Defaults to False. + """ + NameMapping = { + 'movable_object.barrier': 'barrier', + 'vehicle.bicycle': 'bicycle', + 'vehicle.bus.bendy': 'bus', + 'vehicle.bus.rigid': 'bus', + 'vehicle.car': 'car', + 'vehicle.construction': 'construction_vehicle', + 'vehicle.motorcycle': 'motorcycle', + 'human.pedestrian.adult': 'pedestrian', + 'human.pedestrian.child': 'pedestrian', + 'human.pedestrian.construction_worker': 'pedestrian', + 'human.pedestrian.police_officer': 'pedestrian', + 'movable_object.trafficcone': 'traffic_cone', + 'vehicle.trailer': 'trailer', + 'vehicle.truck': 'truck' + } + DefaultAttribute = { + 'car': 'vehicle.parked', + 'pedestrian': 'pedestrian.moving', + 'trailer': 'vehicle.parked', + 'truck': 'vehicle.parked', + 'bus': 'vehicle.moving', + 'motorcycle': 'cycle.without_rider', + 'construction_vehicle': 'vehicle.parked', + 'bicycle': 'cycle.without_rider', + 'barrier': '', + 'traffic_cone': '', + } + AttrMapping = { + 'cycle.with_rider': 0, + 'cycle.without_rider': 1, + 'pedestrian.moving': 2, + 'pedestrian.standing': 3, + 'pedestrian.sitting_lying_down': 4, + 'vehicle.moving': 5, + 'vehicle.parked': 6, + 'vehicle.stopped': 7, + } + AttrMapping_rev = [ + 'cycle.with_rider', + 'cycle.without_rider', + 'pedestrian.moving', + 'pedestrian.standing', + 'pedestrian.sitting_lying_down', + 'vehicle.moving', + 'vehicle.parked', + 'vehicle.stopped', + ] + # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa + ErrNameMapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + 'attr_err': 'mAAE' + } + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + + def __init__(self, + ann_file, + pipeline=None, + data_root=None, + classes=None, + load_interval=1, + with_velocity=True, + modality=None, + box_type_3d='LiDAR', + filter_empty_gt=True, + test_mode=False, + eval_version='detection_cvpr_2019', + use_valid_flag=False): + self.load_interval = load_interval + self.use_valid_flag = use_valid_flag + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode) + + self.with_velocity = with_velocity + self.eval_version = eval_version + from nuscenes.eval.detection.config import config_factory + self.eval_detection_configs = config_factory(self.eval_version) + if self.modality is None: + self.modality = dict( + use_camera=False, + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + ) + + def get_cat_ids(self, idx): + """Get category distribution of single scene. + + Args: + idx (int): Index of the data_info. 
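# Hedged sketch of how this dataset is usually declared in an mmdet3d-style
# config, matching the constructor arguments documented above. The paths,
# empty pipeline and modality dict are placeholders, not values from this repo.
data_root = 'data/nuscenes/'    # placeholder path
class_names = ['car', 'truck', 'trailer', 'bus', 'construction_vehicle',
               'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier']

train_dataset = dict(
    type='NuScenesDataset',
    data_root=data_root,
    ann_file=data_root + 'nuscenes_infos_train.pkl',  # placeholder info file
    pipeline=[],                  # fill in the actual training pipeline
    classes=class_names,
    modality=dict(use_lidar=True, use_camera=False),
    with_velocity=True,
    box_type_3d='LiDAR',
    test_mode=False)
print(train_dataset['type'], len(train_dataset['classes']), 'classes')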
+ + Returns: + dict[list]: for each category, if the current scene + contains such boxes, store a list containing idx, + otherwise, store empty list. + """ + info = self.data_infos[idx] + if self.use_valid_flag: + mask = info['valid_flag'] + gt_names = set(info['gt_names'][mask]) + else: + gt_names = set(info['gt_names']) + + cat_ids = [] + for name in gt_names: + if name in self.CLASSES: + cat_ids.append(self.cat2id[name]) + return cat_ids + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations sorted by timestamps. + """ + data = mmcv.load(ann_file, file_format='pkl') + data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) + data_infos = data_infos[::self.load_interval] + self.metadata = data['metadata'] + self.version = self.metadata['version'] + return data_infos + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations + from lidar to different cameras. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + # standard protocol modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + timestamp=info['timestamp'] / 1e6, + ) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + )) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. 
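# Small sketch of the load_annotations() behaviour above: infos are sorted
# by timestamp and then uniformly subsampled with load_interval. The info
# dicts are dummies standing in for entries of the pkl annotation file.
infos = [{'token': f'sample_{i}', 'timestamp': ts}
         for i, ts in enumerate([30, 10, 50, 20, 40, 60])]

load_interval = 2
data_infos = sorted(infos, key=lambda e: e['timestamp'])[::load_interval]
print([d['timestamp'] for d in data_infos])   # [10, 30, 50]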
+ """ + info = self.data_infos[index] + # filter out bbox containing no points + if self.use_valid_flag: + mask = info['valid_flag'] + else: + mask = info['num_lidar_pts'] > 0 + gt_bboxes_3d = info['gt_boxes'][mask] + gt_names_3d = info['gt_names'][mask] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if self.with_velocity: + gt_velocity = info['gt_velocity'][mask] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d) + return anns_results + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. + """ + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + annos = [] + boxes = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, + mapped_class_names, + self.eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + metric (str, optional): Metric name used for evaluation. + Default: 'bbox'. + result_name (str, optional): Result name in the metric prefix. + Default: 'pts_bbox'. 
+ + Returns: + dict: Dictionary of evaluation details. + """ + from nuscenes import NuScenes + from nuscenes.eval.detection.evaluate import NuScenesEval + + output_dir = osp.join(*osp.split(result_path)[:-1]) + nusc = NuScenes( + version=self.version, dataroot=self.data_root, verbose=False) + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + nusc_eval = NuScenesEval( + nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=False) + nusc_eval.main(render_curves=False) + + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a + dict containing the json filepaths, `tmp_dir` is the temporal + directory created for saving json files when + `jsonfile_prefix` is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str], optional): Metrics to be evaluated. 
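# Sketch of how _evaluate_single() above flattens the devkit's
# metrics_summary.json into the flat dict that gets logged: per-class APs
# at each distance threshold, per-class TP errors, averaged TP errors
# (renamed via ErrNameMapping), plus NDS and mAP. The numbers are invented.
metrics = {
    'label_aps': {'car': {'0.5': 0.61, '1.0': 0.74}},
    'label_tp_errors': {'car': {'trans_err': 0.31}},
    'tp_errors': {'trans_err': 0.38},
    'nd_score': 0.52,
    'mean_ap': 0.41,
}
ERR_NAME_MAPPING = {'trans_err': 'mATE'}

prefix = 'pts_bbox_NuScenes'
detail = {}
for name, aps in metrics['label_aps'].items():
    for k, v in aps.items():
        detail[f'{prefix}/{name}_AP_dist_{k}'] = round(v, 4)
for name, errs in metrics['label_tp_errors'].items():
    for k, v in errs.items():
        detail[f'{prefix}/{name}_{k}'] = round(v, 4)
for k, v in metrics['tp_errors'].items():
    detail[f'{prefix}/{ERR_NAME_MAPPING[k]}'] = round(v, 4)
detail[f'{prefix}/NDS'] = metrics['nd_score']
detail[f'{prefix}/mAP'] = metrics['mean_ap']
print(detail)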
+ Default: 'bbox'. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str, optional): The prefix of json files including + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool, optional): Whether to visualize. + Default: False. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) + return results_dict + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=dict(backend='disk')), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=dict(backend='disk')), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['points']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=False, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Whether to visualize the results online. + Default: False. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + if 'pts_bbox' in result.keys(): + result = result['pts_bbox'] + data_info = self.data_infos[i] + pts_path = data_info['lidar_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points = self._extract_data(i, pipeline, 'points').numpy() + # for now we convert points into depth mode + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + inds = result['scores_3d'] > 0.1 + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() + show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + pred_bboxes = result['boxes_3d'][inds].tensor.numpy() + show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, + file_name, show) + + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + + Args: + detection (dict): Detection results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
+ """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + + # our LiDAR coordinate system -> nuScenes box coordinate system + nus_box_dims = box_dims[:, [1, 0, 2]] + + box_list = [] + for i in range(len(box3d)): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + velocity = (*box3d.tensor[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = NuScenesBox( + box_gravity_center[i], + nus_box_dims[i], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from ego to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str, optional): Evaluation version. + Default: 'detection_cvpr_2019' + + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. + cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list diff --git a/mmdet3d/datasets/nuscenes_mono_dataset.py b/mmdet3d/datasets/nuscenes_mono_dataset.py index c3eb8f1..1da4961 100644 --- a/mmdet3d/datasets/nuscenes_mono_dataset.py +++ b/mmdet3d/datasets/nuscenes_mono_dataset.py @@ -1,840 +1,840 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import tempfile -import warnings -from os import path as osp - -import mmcv -import numpy as np -import pyquaternion -import torch -from nuscenes.utils.data_classes import Box as NuScenesBox - -from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr -from mmdet.datasets import CocoDataset -from ..core import show_multi_modality_result -from ..core.bbox import CameraInstance3DBoxes, get_box_type -from .builder import DATASETS -from .pipelines import Compose -from .utils import extract_result_dict, get_loading_pipeline - - -@DATASETS.register_module() -class NuScenesMonoDataset(CocoDataset): - r"""Monocular 3D detection on NuScenes Dataset. - - This class serves as the API for experiments on the NuScenes Dataset. - - Please refer to `NuScenes Dataset `_ - for data downloading. - - Args: - ann_file (str): Path of annotation file. - data_root (str): Path of dataset root. - load_interval (int, optional): Interval of loading the dataset. It is - used to uniformly sample the dataset. Defaults to 1. - with_velocity (bool, optional): Whether include velocity prediction - into the experiments. Defaults to True. 
- modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'Camera' in this class. Available options includes. - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - eval_version (str, optional): Configuration version of evaluation. - Defaults to 'detection_cvpr_2019'. - use_valid_flag (bool, optional): Whether to use `use_valid_flag` key - in the info file as mask to filter gt_boxes and gt_names. - Defaults to False. - version (str, optional): Dataset version. Defaults to 'v1.0-trainval'. - """ - CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', - 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', - 'barrier') - DefaultAttribute = { - 'car': 'vehicle.parked', - 'pedestrian': 'pedestrian.moving', - 'trailer': 'vehicle.parked', - 'truck': 'vehicle.parked', - 'bus': 'vehicle.moving', - 'motorcycle': 'cycle.without_rider', - 'construction_vehicle': 'vehicle.parked', - 'bicycle': 'cycle.without_rider', - 'barrier': '', - 'traffic_cone': '', - } - # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa - ErrNameMapping = { - 'trans_err': 'mATE', - 'scale_err': 'mASE', - 'orient_err': 'mAOE', - 'vel_err': 'mAVE', - 'attr_err': 'mAAE' - } - - def __init__(self, - data_root, - ann_file, - pipeline, - load_interval=1, - with_velocity=True, - modality=None, - box_type_3d='Camera', - eval_version='detection_cvpr_2019', - use_valid_flag=False, - version='v1.0-trainval', - classes=None, - img_prefix='', - seg_prefix=None, - proposal_file=None, - test_mode=False, - filter_empty_gt=True, - file_client_args=dict(backend='disk')): - self.ann_file = ann_file - self.data_root = data_root - self.img_prefix = img_prefix - self.seg_prefix = seg_prefix - self.proposal_file = proposal_file - self.test_mode = test_mode - self.filter_empty_gt = filter_empty_gt - self.CLASSES = self.get_classes(classes) - self.file_client = mmcv.FileClient(**file_client_args) - - # load annotations (and proposals) - with self.file_client.get_local_path(self.ann_file) as local_path: - self.data_infos = self.load_annotations(local_path) - - if self.proposal_file is not None: - with self.file_client.get_local_path( - self.proposal_file) as local_path: - self.proposals = self.load_proposals(local_path) - else: - self.proposals = None - - # filter images too small and containing no annotations - if not test_mode: - valid_inds = self._filter_imgs() - self.data_infos = [self.data_infos[i] for i in valid_inds] - if self.proposals is not None: - self.proposals = [self.proposals[i] for i in valid_inds] - # set group flag for the sampler - self._set_group_flag() - - # processing pipeline - self.pipeline = Compose(pipeline) - - self.load_interval = load_interval - self.with_velocity = with_velocity - self.modality = modality - self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) - self.eval_version = eval_version - self.use_valid_flag = use_valid_flag - self.bbox_code_size = 9 - self.version = version - if self.eval_version is not None: - from nuscenes.eval.detection.config import config_factory - self.eval_detection_configs = config_factory(self.eval_version) - if self.modality is 
None: - self.modality = dict( - use_camera=True, - use_lidar=False, - use_radar=False, - use_map=False, - use_external=False) - - def pre_pipeline(self, results): - """Initialization before data preparation. - - Args: - results (dict): Dict before data preprocessing. - - - img_fields (list): Image fields. - - bbox3d_fields (list): 3D bounding boxes fields. - - pts_mask_fields (list): Mask fields of points. - - pts_seg_fields (list): Mask fields of point segments. - - bbox_fields (list): Fields of bounding boxes. - - mask_fields (list): Fields of masks. - - seg_fields (list): Segment fields. - - box_type_3d (str): 3D box type. - - box_mode_3d (str): 3D box mode. - """ - results['img_prefix'] = self.img_prefix - results['seg_prefix'] = self.seg_prefix - results['proposal_file'] = self.proposal_file - results['img_fields'] = [] - results['bbox3d_fields'] = [] - results['pts_mask_fields'] = [] - results['pts_seg_fields'] = [] - results['bbox_fields'] = [] - results['mask_fields'] = [] - results['seg_fields'] = [] - results['box_type_3d'] = self.box_type_3d - results['box_mode_3d'] = self.box_mode_3d - - def _parse_ann_info(self, img_info, ann_info): - """Parse bbox annotation. - - Args: - img_info (list[dict]): Image info. - ann_info (list[dict]): Annotation info of an image. - - Returns: - dict: A dict containing the following keys: bboxes, labels, - gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, - depths, bboxes_ignore, masks, seg_map - """ - gt_bboxes = [] - gt_labels = [] - attr_labels = [] - gt_bboxes_ignore = [] - gt_masks_ann = [] - gt_bboxes_cam3d = [] - centers2d = [] - depths = [] - for i, ann in enumerate(ann_info): - if ann.get('ignore', False): - continue - x1, y1, w, h = ann['bbox'] - inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) - inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) - if inter_w * inter_h == 0: - continue - if ann['area'] <= 0 or w < 1 or h < 1: - continue - if ann['category_id'] not in self.cat_ids: - continue - bbox = [x1, y1, x1 + w, y1 + h] - if ann.get('iscrowd', False): - gt_bboxes_ignore.append(bbox) - else: - gt_bboxes.append(bbox) - gt_labels.append(self.cat2label[ann['category_id']]) - attr_labels.append(ann['attribute_id']) - gt_masks_ann.append(ann.get('segmentation', None)) - # 3D annotations in camera coordinates - bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1) - velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2) - nan_mask = np.isnan(velo_cam3d[:, 0]) - velo_cam3d[nan_mask] = [0.0, 0.0] - bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1) - gt_bboxes_cam3d.append(bbox_cam3d.squeeze()) - # 2.5D annotations in camera coordinates - center2d = ann['center2d'][:2] - depth = ann['center2d'][2] - centers2d.append(center2d) - depths.append(depth) - - if gt_bboxes: - gt_bboxes = np.array(gt_bboxes, dtype=np.float32) - gt_labels = np.array(gt_labels, dtype=np.int64) - attr_labels = np.array(attr_labels, dtype=np.int64) - else: - gt_bboxes = np.zeros((0, 4), dtype=np.float32) - gt_labels = np.array([], dtype=np.int64) - attr_labels = np.array([], dtype=np.int64) - - if gt_bboxes_cam3d: - gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) - centers2d = np.array(centers2d, dtype=np.float32) - depths = np.array(depths, dtype=np.float32) - else: - gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), - dtype=np.float32) - centers2d = np.zeros((0, 2), dtype=np.float32) - depths = np.zeros((0), dtype=np.float32) - - gt_bboxes_cam3d = CameraInstance3DBoxes( - gt_bboxes_cam3d, - 
box_dim=gt_bboxes_cam3d.shape[-1], - origin=(0.5, 0.5, 0.5)) - gt_labels_3d = copy.deepcopy(gt_labels) - - if gt_bboxes_ignore: - gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) - else: - gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) - - seg_map = img_info['filename'].replace('jpg', 'png') - - ann = dict( - bboxes=gt_bboxes, - labels=gt_labels, - gt_bboxes_3d=gt_bboxes_cam3d, - gt_labels_3d=gt_labels_3d, - attr_labels=attr_labels, - centers2d=centers2d, - depths=depths, - bboxes_ignore=gt_bboxes_ignore, - masks=gt_masks_ann, - seg_map=seg_map) - - return ann - - def get_attr_name(self, attr_idx, label_name): - """Get attribute from predicted index. - - This is a workaround to predict attribute when the predicted velocity - is not reliable. We map the predicted attribute index to the one - in the attribute set. If it is consistent with the category, we will - keep it. Otherwise, we will use the default attribute. - - Args: - attr_idx (int): Attribute index. - label_name (str): Predicted category name. - - Returns: - str: Predicted attribute name. - """ - # TODO: Simplify the variable name - AttrMapping_rev2 = [ - 'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', - 'pedestrian.standing', 'pedestrian.sitting_lying_down', - 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None' - ] - if label_name == 'car' or label_name == 'bus' \ - or label_name == 'truck' or label_name == 'trailer' \ - or label_name == 'construction_vehicle': - if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \ - AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \ - AttrMapping_rev2[attr_idx] == 'vehicle.stopped': - return AttrMapping_rev2[attr_idx] - else: - return NuScenesMonoDataset.DefaultAttribute[label_name] - elif label_name == 'pedestrian': - if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \ - AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \ - AttrMapping_rev2[attr_idx] == \ - 'pedestrian.sitting_lying_down': - return AttrMapping_rev2[attr_idx] - else: - return NuScenesMonoDataset.DefaultAttribute[label_name] - elif label_name == 'bicycle' or label_name == 'motorcycle': - if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \ - AttrMapping_rev2[attr_idx] == 'cycle.without_rider': - return AttrMapping_rev2[attr_idx] - else: - return NuScenesMonoDataset.DefaultAttribute[label_name] - else: - return NuScenesMonoDataset.DefaultAttribute[label_name] - - def _format_bbox(self, results, jsonfile_prefix=None): - """Convert the results to the standard format. - - Args: - results (list[dict]): Testing results of the dataset. - jsonfile_prefix (str): The prefix of the output jsonfile. - You can specify the output directory/filename by - modifying the jsonfile_prefix. Default: None. - - Returns: - str: Path of the output json file. 
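# Compact sketch of the consistency check in get_attr_name() above: the
# predicted attribute index is only kept when it belongs to the attribute
# group of the predicted category, otherwise the per-class default wins.
# The tables are abbreviated (only a few classes) and the sample
# predictions are invented.
ATTR_NAMES = ['cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
              'pedestrian.standing', 'pedestrian.sitting_lying_down',
              'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None']
VALID_ATTRS = {
    'car': {'vehicle.moving', 'vehicle.parked', 'vehicle.stopped'},
    'pedestrian': {'pedestrian.moving', 'pedestrian.standing',
                   'pedestrian.sitting_lying_down'},
    'bicycle': {'cycle.with_rider', 'cycle.without_rider'},
}
DEFAULT_ATTRIBUTE = {'car': 'vehicle.parked', 'pedestrian': 'pedestrian.moving',
                     'bicycle': 'cycle.without_rider'}

def get_attr_name(attr_idx, label_name):
    attr = ATTR_NAMES[attr_idx]
    if attr in VALID_ATTRS.get(label_name, set()):
        return attr
    return DEFAULT_ATTRIBUTE.get(label_name, '')

print(get_attr_name(5, 'car'))         # vehicle.moving (consistent, kept)
print(get_attr_name(0, 'pedestrian'))  # pedestrian.moving (fallback)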
- """ - nusc_annos = {} - mapped_class_names = self.CLASSES - - print('Start to convert detection format...') - - CAM_NUM = 6 - - for sample_id, det in enumerate(mmcv.track_iter_progress(results)): - - if sample_id % CAM_NUM == 0: - boxes_per_frame = [] - attrs_per_frame = [] - - # need to merge results from images of the same sample - annos = [] - boxes, attrs = output_to_nusc_box(det) - sample_token = self.data_infos[sample_id]['token'] - boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id], - boxes, attrs, - mapped_class_names, - self.eval_detection_configs, - self.eval_version) - - boxes_per_frame.extend(boxes) - attrs_per_frame.extend(attrs) - # Remove redundant predictions caused by overlap of images - if (sample_id + 1) % CAM_NUM != 0: - continue - boxes = global_nusc_box_to_cam( - self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame, - mapped_class_names, self.eval_detection_configs, - self.eval_version) - cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes) - # box nms 3d over 6 images in a frame - # TODO: move this global setting into config - nms_cfg = dict( - use_rotate_nms=True, - nms_across_levels=False, - nms_pre=4096, - nms_thr=0.05, - score_thr=0.01, - min_bbox_size=0, - max_per_frame=500) - from mmcv import Config - nms_cfg = Config(nms_cfg) - cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev) - boxes3d = cam_boxes3d.tensor - # generate attr scores from attr labels - attrs = labels.new_tensor([attr for attr in attrs_per_frame]) - boxes3d, scores, labels, attrs = box3d_multiclass_nms( - boxes3d, - cam_boxes3d_for_nms, - scores, - nms_cfg.score_thr, - nms_cfg.max_per_frame, - nms_cfg, - mlvl_attr_scores=attrs) - cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9) - det = bbox3d2result(cam_boxes3d, scores, labels, attrs) - boxes, attrs = output_to_nusc_box(det) - boxes, attrs = cam_nusc_box_to_global( - self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs, - mapped_class_names, self.eval_detection_configs, - self.eval_version) - - for i, box in enumerate(boxes): - name = mapped_class_names[box.label] - attr = self.get_attr_name(attrs[i], name) - nusc_anno = dict( - sample_token=sample_token, - translation=box.center.tolist(), - size=box.wlh.tolist(), - rotation=box.orientation.elements.tolist(), - velocity=box.velocity[:2].tolist(), - detection_name=name, - detection_score=box.score, - attribute_name=attr) - annos.append(nusc_anno) - # other views results of the same frame should be concatenated - if sample_token in nusc_annos: - nusc_annos[sample_token].extend(annos) - else: - nusc_annos[sample_token] = annos - - nusc_submissions = { - 'meta': self.modality, - 'results': nusc_annos, - } - - mmcv.mkdir_or_exist(jsonfile_prefix) - res_path = osp.join(jsonfile_prefix, 'results_nusc.json') - print('Results writes to', res_path) - mmcv.dump(nusc_submissions, res_path) - return res_path - - def _evaluate_single(self, - result_path, - logger=None, - metric='bbox', - result_name='img_bbox'): - """Evaluation for a single model in nuScenes protocol. - - Args: - result_path (str): Path of the result file. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - metric (str, optional): Metric name used for evaluation. - Default: 'bbox'. - result_name (str, optional): Result name in the metric prefix. - Default: 'img_bbox'. - - Returns: - dict: Dictionary of evaluation details. 
- """ - from nuscenes import NuScenes - from nuscenes.eval.detection.evaluate import NuScenesEval - - output_dir = osp.join(*osp.split(result_path)[:-1]) - nusc = NuScenes( - version=self.version, dataroot=self.data_root, verbose=False) - eval_set_map = { - 'v1.0-mini': 'mini_val', - 'v1.0-trainval': 'val', - } - nusc_eval = NuScenesEval( - nusc, - config=self.eval_detection_configs, - result_path=result_path, - eval_set=eval_set_map[self.version], - output_dir=output_dir, - verbose=False) - nusc_eval.main(render_curves=True) - - # record metrics - metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) - detail = dict() - metric_prefix = f'{result_name}_NuScenes' - for name in self.CLASSES: - for k, v in metrics['label_aps'][name].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val - for k, v in metrics['label_tp_errors'][name].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}_{}'.format(metric_prefix, name, k)] = val - for k, v in metrics['tp_errors'].items(): - val = float('{:.4f}'.format(v)) - detail['{}/{}'.format(metric_prefix, - self.ErrNameMapping[k])] = val - - detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] - detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] - return detail - - def format_results(self, results, jsonfile_prefix=None, **kwargs): - """Format the results to json (standard format for COCO evaluation). - - Args: - results (list[tuple | numpy.ndarray]): Testing results of the - dataset. - jsonfile_prefix (str): The prefix of json files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - - Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing - the json filepaths, tmp_dir is the temporal directory created - for saving json files when jsonfile_prefix is not specified. - """ - assert isinstance(results, list), 'results must be a list' - assert len(results) == len(self), ( - 'The length of results is not equal to the dataset len: {} != {}'. - format(len(results), len(self))) - - if jsonfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - jsonfile_prefix = osp.join(tmp_dir.name, 'results') - else: - tmp_dir = None - - # currently the output prediction results could be in two formats - # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) - # 2. list of dict('pts_bbox' or 'img_bbox': - # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) - # this is a workaround to enable evaluation of both formats on nuScenes - # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 - if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): - result_files = self._format_bbox(results, jsonfile_prefix) - else: - # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict - result_files = dict() - for name in results[0]: - # not evaluate 2D predictions on nuScenes - if '2d' in name: - continue - print(f'\nFormating bboxes of {name}') - results_ = [out[name] for out in results] - tmp_file_ = osp.join(jsonfile_prefix, name) - result_files.update( - {name: self._format_bbox(results_, tmp_file_)}) - - return result_files, tmp_dir - - def evaluate(self, - results, - metric='bbox', - logger=None, - jsonfile_prefix=None, - result_names=['img_bbox'], - show=False, - out_dir=None, - pipeline=None): - """Evaluation in nuScenes protocol. - - Args: - results (list[dict]): Testing results of the dataset. 
- metric (str | list[str], optional): Metrics to be evaluated. - Default: 'bbox'. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - jsonfile_prefix (str): The prefix of json files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - result_names (list[str], optional): Result names in the - metric prefix. Default: ['img_bbox']. - show (bool, optional): Whether to visualize. - Default: False. - out_dir (str, optional): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict[str, float]: Results of each evaluation metric. - """ - - result_files, tmp_dir = self.format_results(results, jsonfile_prefix) - - if isinstance(result_files, dict): - results_dict = dict() - for name in result_names: - print('Evaluating bboxes of {}'.format(name)) - ret_dict = self._evaluate_single(result_files[name]) - results_dict.update(ret_dict) - elif isinstance(result_files, str): - results_dict = self._evaluate_single(result_files) - - if tmp_dir is not None: - tmp_dir.cleanup() - - if show or out_dir: - self.show(results, out_dir, pipeline=pipeline) - return results_dict - - def _extract_data(self, index, pipeline, key, load_annos=False): - """Load data using input pipeline and extract data according to key. - - Args: - index (int): Index for accessing the target data. - pipeline (:obj:`Compose`): Composed data loading pipeline. - key (str | list[str]): One single or a list of data key. - load_annos (bool): Whether to load data annotations. - If True, need to set self.test_mode as False before loading. - - Returns: - np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: - A single or a list of loaded data. - """ - assert pipeline is not None, 'data loading pipeline is not provided' - img_info = self.data_infos[index] - input_dict = dict(img_info=img_info) - - if load_annos: - ann_info = self.get_ann_info(index) - input_dict.update(dict(ann_info=ann_info)) - - self.pre_pipeline(input_dict) - example = pipeline(input_dict) - - # extract data items according to keys - if isinstance(key, str): - data = extract_result_dict(example, key) - else: - data = [extract_result_dict(example, k) for k in key] - - return data - - def _get_pipeline(self, pipeline): - """Get data loading pipeline in self.show/evaluate function. - - Args: - pipeline (list[dict]): Input pipeline. If None is given, - get from self.pipeline. - """ - if pipeline is None: - if not hasattr(self, 'pipeline') or self.pipeline is None: - warnings.warn( - 'Use default pipeline for data loading, this may cause ' - 'errors when data is on ceph') - return self._build_default_pipeline() - loading_pipeline = get_loading_pipeline(self.pipeline.transforms) - return Compose(loading_pipeline) - return Compose(pipeline) - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict(type='LoadImageFromFileMono3D'), - dict( - type='DefaultFormatBundle3D', - class_names=self.CLASSES, - with_label=False), - dict(type='Collect3D', keys=['img']) - ] - return Compose(pipeline) - - def show(self, results, out_dir, show=False, pipeline=None): - """Results visualization. - - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Whether to visualize the results online. 
- Default: False. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - if 'img_bbox' in result.keys(): - result = result['img_bbox'] - data_info = self.data_infos[i] - img_path = data_info['file_name'] - file_name = osp.split(img_path)[-1].split('.')[0] - img, img_metas = self._extract_data(i, pipeline, - ['img', 'img_metas']) - # need to transpose channel to first dim - img = img.numpy().transpose(1, 2, 0) - gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'] - pred_bboxes = result['boxes_3d'] - show_multi_modality_result( - img, - gt_bboxes, - pred_bboxes, - img_metas['cam2img'], - out_dir, - file_name, - box_mode='camera', - show=show) - - -def output_to_nusc_box(detection): - """Convert the output to the box class in the nuScenes. - - Args: - detection (dict): Detection results. - - - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - - scores_3d (torch.Tensor): Detection scores. - - labels_3d (torch.Tensor): Predicted box labels. - - attrs_3d (torch.Tensor, optional): Predicted attributes. - - Returns: - list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. - """ - box3d = detection['boxes_3d'] - scores = detection['scores_3d'].numpy() - labels = detection['labels_3d'].numpy() - attrs = None - if 'attrs_3d' in detection: - attrs = detection['attrs_3d'].numpy() - - box_gravity_center = box3d.gravity_center.numpy() - box_dims = box3d.dims.numpy() - box_yaw = box3d.yaw.numpy() - - # convert the dim/rot to nuscbox convention - box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]] - box_yaw = -box_yaw - - box_list = [] - for i in range(len(box3d)): - q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) - q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2) - quat = q2 * q1 - velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8]) - box = NuScenesBox( - box_gravity_center[i], - box_dims[i], - quat, - label=labels[i], - score=scores[i], - velocity=velocity) - box_list.append(box) - return box_list, attrs - - -def cam_nusc_box_to_global(info, - boxes, - attrs, - classes, - eval_configs, - eval_version='detection_cvpr_2019'): - """Convert the box from camera to global coordinate. - - Args: - info (dict): Info for a specific sample data, including the - calibration information. - boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. - classes (list[str]): Mapped classes in the evaluation. - eval_configs (object): Evaluation configuration object. - eval_version (str, optional): Evaluation version. - Default: 'detection_cvpr_2019' - - Returns: - list: List of standard NuScenesBoxes in the global - coordinate. - """ - box_list = [] - attr_list = [] - for (box, attr) in zip(boxes, attrs): - # Move box to ego vehicle coord system - box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation'])) - box.translate(np.array(info['cam2ego_translation'])) - # filter det in ego. 
- cls_range_map = eval_configs.class_range - radius = np.linalg.norm(box.center[:2], 2) - det_range = cls_range_map[classes[box.label]] - if radius > det_range: - continue - # Move box to global coord system - box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) - box.translate(np.array(info['ego2global_translation'])) - box_list.append(box) - attr_list.append(attr) - return box_list, attr_list - - -def global_nusc_box_to_cam(info, - boxes, - classes, - eval_configs, - eval_version='detection_cvpr_2019'): - """Convert the box from global to camera coordinate. - - Args: - info (dict): Info for a specific sample data, including the - calibration information. - boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. - classes (list[str]): Mapped classes in the evaluation. - eval_configs (object): Evaluation configuration object. - eval_version (str, optional): Evaluation version. - Default: 'detection_cvpr_2019' - - Returns: - list: List of standard NuScenesBoxes in the global - coordinate. - """ - box_list = [] - for box in boxes: - # Move box to ego vehicle coord system - box.translate(-np.array(info['ego2global_translation'])) - box.rotate( - pyquaternion.Quaternion(info['ego2global_rotation']).inverse) - # filter det in ego. - cls_range_map = eval_configs.class_range - radius = np.linalg.norm(box.center[:2], 2) - det_range = cls_range_map[classes[box.label]] - if radius > det_range: - continue - # Move box to camera coord system - box.translate(-np.array(info['cam2ego_translation'])) - box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse) - box_list.append(box) - return box_list - - -def nusc_box_to_cam_box3d(boxes): - """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`. - - Args: - boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. - - Returns: - tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): - Converted 3D bounding boxes, scores and labels. - """ - locs = torch.Tensor([b.center for b in boxes]).view(-1, 3) - dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3) - rots = torch.Tensor([b.orientation.yaw_pitch_roll[0] - for b in boxes]).view(-1, 1) - velocity = torch.Tensor([b.velocity[0::2] for b in boxes]).view(-1, 2) - - # convert nusbox to cambox convention - dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]] - rots = -rots - - boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda() - cam_boxes3d = CameraInstance3DBoxes( - boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5)) - scores = torch.Tensor([b.score for b in boxes]).cuda() - labels = torch.LongTensor([b.label for b in boxes]).cuda() - nms_scores = scores.new_zeros(scores.shape[0], 10 + 1) - indices = labels.new_tensor(list(range(scores.shape[0]))) - nms_scores[indices, labels] = scores - return cam_boxes3d, nms_scores, labels +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import tempfile +import warnings +from os import path as osp + +import mmcv +import numpy as np +import pyquaternion +import torch +from nuscenes.utils.data_classes import Box as NuScenesBox + +from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr +from mmdet.datasets import CocoDataset +from ..core import show_multi_modality_result +from ..core.bbox import CameraInstance3DBoxes, get_box_type +from .builder import DATASETS +from .pipelines import Compose +from .utils import extract_result_dict, get_loading_pipeline + + +@DATASETS.register_module() +class NuScenesMonoDataset(CocoDataset): + r"""Monocular 3D detection on NuScenes Dataset. 
+ + This class serves as the API for experiments on the NuScenes Dataset. + + Please refer to `NuScenes Dataset `_ + for data downloading. + + Args: + ann_file (str): Path of annotation file. + data_root (str): Path of dataset root. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + with_velocity (bool, optional): Whether to include velocity prediction + in the experiments. Defaults to True. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + in its original format and then convert them to `box_type_3d`. + Defaults to 'Camera' in this class. Available options include: + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor datasets. + - 'Camera': Box in camera coordinates. + eval_version (str, optional): Configuration version of evaluation. + Defaults to 'detection_cvpr_2019'. + use_valid_flag (bool, optional): Whether to use `use_valid_flag` key + in the info file as a mask to filter gt_boxes and gt_names. + Defaults to False. + version (str, optional): Dataset version. Defaults to 'v1.0-trainval'. + """ + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + DefaultAttribute = { + 'car': 'vehicle.parked', + 'pedestrian': 'pedestrian.moving', + 'trailer': 'vehicle.parked', + 'truck': 'vehicle.parked', + 'bus': 'vehicle.moving', + 'motorcycle': 'cycle.without_rider', + 'construction_vehicle': 'vehicle.parked', + 'bicycle': 'cycle.without_rider', + 'barrier': '', + 'traffic_cone': '', + } + # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa + ErrNameMapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + 'attr_err': 'mAAE' + } + + def __init__(self, + data_root, + ann_file, + pipeline, + load_interval=1, + with_velocity=True, + modality=None, + box_type_3d='Camera', + eval_version='detection_cvpr_2019', + use_valid_flag=False, + version='v1.0-trainval', + classes=None, + img_prefix='', + seg_prefix=None, + proposal_file=None, + test_mode=False, + filter_empty_gt=True, + file_client_args=dict(backend='disk')): + self.ann_file = ann_file + self.data_root = data_root + self.img_prefix = img_prefix + self.seg_prefix = seg_prefix + self.proposal_file = proposal_file + self.test_mode = test_mode + self.filter_empty_gt = filter_empty_gt + self.CLASSES = self.get_classes(classes) + self.file_client = mmcv.FileClient(**file_client_args) + + # load annotations (and proposals) + with self.file_client.get_local_path(self.ann_file) as local_path: + self.data_infos = self.load_annotations(local_path) + + if self.proposal_file is not None: + with self.file_client.get_local_path( + self.proposal_file) as local_path: + self.proposals = self.load_proposals(local_path) + else: + self.proposals = None + + # filter images too small and containing no annotations + if not test_mode: + valid_inds = self._filter_imgs() + self.data_infos = [self.data_infos[i] for i in valid_inds] + if self.proposals is not None: + self.proposals = [self.proposals[i] for i in valid_inds] + # set group flag for the sampler + self._set_group_flag() + + # processing pipeline + self.pipeline = Compose(pipeline) + +
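# Illustrative sketch: how `NuScenesMonoDataset` is typically declared in an
# mmdet3d-style config. The data paths below are placeholders and the pipeline
# simply mirrors `_build_default_pipeline` defined later in this file; adjust
# both to the actual setup.
example_dataset_cfg = dict(
    type='NuScenesMonoDataset',
    data_root='data/nuscenes/',                        # placeholder path
    ann_file='data/nuscenes/annotations_mono3d.json',  # placeholder path
    pipeline=[
        dict(type='LoadImageFromFileMono3D'),
        dict(
            type='DefaultFormatBundle3D',
            class_names=('car', 'truck', 'trailer', 'bus',
                         'construction_vehicle', 'bicycle', 'motorcycle',
                         'pedestrian', 'traffic_cone', 'barrier'),
            with_label=False),
        dict(type='Collect3D', keys=['img'])
    ],
    modality=dict(
        use_camera=True,
        use_lidar=False,
        use_radar=False,
        use_map=False,
        use_external=False),
    box_type_3d='Camera',
    eval_version='detection_cvpr_2019',
    version='v1.0-trainval',
    test_mode=True)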
self.load_interval = load_interval + self.with_velocity = with_velocity + self.modality = modality + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + self.eval_version = eval_version + self.use_valid_flag = use_valid_flag + self.bbox_code_size = 9 + self.version = version + if self.eval_version is not None: + from nuscenes.eval.detection.config import config_factory + self.eval_detection_configs = config_factory(self.eval_version) + if self.modality is None: + self.modality = dict( + use_camera=True, + use_lidar=False, + use_radar=False, + use_map=False, + use_external=False) + + def pre_pipeline(self, results): + """Initialization before data preparation. + + Args: + results (dict): Dict before data preprocessing. + + - img_fields (list): Image fields. + - bbox3d_fields (list): 3D bounding boxes fields. + - pts_mask_fields (list): Mask fields of points. + - pts_seg_fields (list): Mask fields of point segments. + - bbox_fields (list): Fields of bounding boxes. + - mask_fields (list): Fields of masks. + - seg_fields (list): Segment fields. + - box_type_3d (str): 3D box type. + - box_mode_3d (str): 3D box mode. + """ + results['img_prefix'] = self.img_prefix + results['seg_prefix'] = self.seg_prefix + results['proposal_file'] = self.proposal_file + results['img_fields'] = [] + results['bbox3d_fields'] = [] + results['pts_mask_fields'] = [] + results['pts_seg_fields'] = [] + results['bbox_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + results['box_type_3d'] = self.box_type_3d + results['box_mode_3d'] = self.box_mode_3d + + def _parse_ann_info(self, img_info, ann_info): + """Parse bbox annotation. + + Args: + img_info (list[dict]): Image info. + ann_info (list[dict]): Annotation info of an image. + + Returns: + dict: A dict containing the following keys: bboxes, labels, + gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, + depths, bboxes_ignore, masks, seg_map + """ + gt_bboxes = [] + gt_labels = [] + attr_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + gt_bboxes_cam3d = [] + centers2d = [] + depths = [] + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(self.cat2label[ann['category_id']]) + attr_labels.append(ann['attribute_id']) + gt_masks_ann.append(ann.get('segmentation', None)) + # 3D annotations in camera coordinates + bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1) + velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2) + nan_mask = np.isnan(velo_cam3d[:, 0]) + velo_cam3d[nan_mask] = [0.0, 0.0] + bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1) + gt_bboxes_cam3d.append(bbox_cam3d.squeeze()) + # 2.5D annotations in camera coordinates + center2d = ann['center2d'][:2] + depth = ann['center2d'][2] + centers2d.append(center2d) + depths.append(depth) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + attr_labels = np.array(attr_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + attr_labels = 
np.array([], dtype=np.int64) + + if gt_bboxes_cam3d: + gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) + centers2d = np.array(centers2d, dtype=np.float32) + depths = np.array(depths, dtype=np.float32) + else: + gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), + dtype=np.float32) + centers2d = np.zeros((0, 2), dtype=np.float32) + depths = np.zeros((0), dtype=np.float32) + + gt_bboxes_cam3d = CameraInstance3DBoxes( + gt_bboxes_cam3d, + box_dim=gt_bboxes_cam3d.shape[-1], + origin=(0.5, 0.5, 0.5)) + gt_labels_3d = copy.deepcopy(gt_labels) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + seg_map = img_info['filename'].replace('jpg', 'png') + + ann = dict( + bboxes=gt_bboxes, + labels=gt_labels, + gt_bboxes_3d=gt_bboxes_cam3d, + gt_labels_3d=gt_labels_3d, + attr_labels=attr_labels, + centers2d=centers2d, + depths=depths, + bboxes_ignore=gt_bboxes_ignore, + masks=gt_masks_ann, + seg_map=seg_map) + + return ann + + def get_attr_name(self, attr_idx, label_name): + """Get attribute from predicted index. + + This is a workaround to predict attribute when the predicted velocity + is not reliable. We map the predicted attribute index to the one + in the attribute set. If it is consistent with the category, we will + keep it. Otherwise, we will use the default attribute. + + Args: + attr_idx (int): Attribute index. + label_name (str): Predicted category name. + + Returns: + str: Predicted attribute name. + """ + # TODO: Simplify the variable name + AttrMapping_rev2 = [ + 'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', + 'pedestrian.standing', 'pedestrian.sitting_lying_down', + 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None' + ] + if label_name == 'car' or label_name == 'bus' \ + or label_name == 'truck' or label_name == 'trailer' \ + or label_name == 'construction_vehicle': + if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \ + AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \ + AttrMapping_rev2[attr_idx] == 'vehicle.stopped': + return AttrMapping_rev2[attr_idx] + else: + return NuScenesMonoDataset.DefaultAttribute[label_name] + elif label_name == 'pedestrian': + if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \ + AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \ + AttrMapping_rev2[attr_idx] == \ + 'pedestrian.sitting_lying_down': + return AttrMapping_rev2[attr_idx] + else: + return NuScenesMonoDataset.DefaultAttribute[label_name] + elif label_name == 'bicycle' or label_name == 'motorcycle': + if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \ + AttrMapping_rev2[attr_idx] == 'cycle.without_rider': + return AttrMapping_rev2[attr_idx] + else: + return NuScenesMonoDataset.DefaultAttribute[label_name] + else: + return NuScenesMonoDataset.DefaultAttribute[label_name] + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. 
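# A set-based sketch equivalent to the attribute fallback in `get_attr_name`
# above (illustration only; only a subset of categories is shown): the
# predicted attribute is kept when it is consistent with the predicted
# category, otherwise the category's default attribute is used.
ATTR_NAMES = [
    'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
    'pedestrian.standing', 'pedestrian.sitting_lying_down',
    'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
]
VALID_ATTRS = {
    'car': {'vehicle.moving', 'vehicle.parked', 'vehicle.stopped'},
    'pedestrian': {'pedestrian.moving', 'pedestrian.standing',
                   'pedestrian.sitting_lying_down'},
    'bicycle': {'cycle.with_rider', 'cycle.without_rider'},
}
DEFAULT_ATTR = {
    'car': 'vehicle.parked',
    'pedestrian': 'pedestrian.moving',
    'bicycle': 'cycle.without_rider',
}


def attr_fallback(attr_idx, label_name):
    """Keep the predicted attribute only if it matches the category."""
    attr = ATTR_NAMES[attr_idx]
    if attr in VALID_ATTRS.get(label_name, set()):
        return attr
    return DEFAULT_ATTR[label_name]


# attr_fallback(5, 'car') -> 'vehicle.moving'  (consistent, prediction kept)
# attr_fallback(0, 'car') -> 'vehicle.parked'  (inconsistent, default used)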
+ """ + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + + CAM_NUM = 6 + + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + + if sample_id % CAM_NUM == 0: + boxes_per_frame = [] + attrs_per_frame = [] + + # need to merge results from images of the same sample + annos = [] + boxes, attrs = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id], + boxes, attrs, + mapped_class_names, + self.eval_detection_configs, + self.eval_version) + + boxes_per_frame.extend(boxes) + attrs_per_frame.extend(attrs) + # Remove redundant predictions caused by overlap of images + if (sample_id + 1) % CAM_NUM != 0: + continue + boxes = global_nusc_box_to_cam( + self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame, + mapped_class_names, self.eval_detection_configs, + self.eval_version) + cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes) + # box nms 3d over 6 images in a frame + # TODO: move this global setting into config + nms_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.05, + score_thr=0.01, + min_bbox_size=0, + max_per_frame=500) + from mmcv import Config + nms_cfg = Config(nms_cfg) + cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev) + boxes3d = cam_boxes3d.tensor + # generate attr scores from attr labels + attrs = labels.new_tensor([attr for attr in attrs_per_frame]) + boxes3d, scores, labels, attrs = box3d_multiclass_nms( + boxes3d, + cam_boxes3d_for_nms, + scores, + nms_cfg.score_thr, + nms_cfg.max_per_frame, + nms_cfg, + mlvl_attr_scores=attrs) + cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9) + det = bbox3d2result(cam_boxes3d, scores, labels, attrs) + boxes, attrs = output_to_nusc_box(det) + boxes, attrs = cam_nusc_box_to_global( + self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs, + mapped_class_names, self.eval_detection_configs, + self.eval_version) + + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + attr = self.get_attr_name(attrs[i], name) + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr) + annos.append(nusc_anno) + # other views results of the same frame should be concatenated + if sample_token in nusc_annos: + nusc_annos[sample_token].extend(annos) + else: + nusc_annos[sample_token] = annos + + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='img_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + metric (str, optional): Metric name used for evaluation. + Default: 'bbox'. + result_name (str, optional): Result name in the metric prefix. + Default: 'img_bbox'. + + Returns: + dict: Dictionary of evaluation details. 
+ """ + from nuscenes import NuScenes + from nuscenes.eval.detection.evaluate import NuScenesEval + + output_dir = osp.join(*osp.split(result_path)[:-1]) + nusc = NuScenes( + version=self.version, dataroot=self.data_root, verbose=False) + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + nusc_eval = NuScenesEval( + nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=False) + nusc_eval.main(render_curves=True) + + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail + + def format_results(self, results, jsonfile_prefix=None, **kwargs): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[tuple | numpy.ndarray]): Testing results of the + dataset. + jsonfile_prefix (str): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created + for saving json files when jsonfile_prefix is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + # not evaluate 2D predictions on nuScenes + if '2d' in name: + continue + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['img_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. 
+ metric (str | list[str], optional): Metrics to be evaluated. + Default: 'bbox'. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + result_names (list[str], optional): Result names in the + metric prefix. Default: ['img_bbox']. + show (bool, optional): Whether to visualize. + Default: False. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show or out_dir: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + + def _extract_data(self, index, pipeline, key, load_annos=False): + """Load data using input pipeline and extract data according to key. + + Args: + index (int): Index for accessing the target data. + pipeline (:obj:`Compose`): Composed data loading pipeline. + key (str | list[str]): One single or a list of data key. + load_annos (bool): Whether to load data annotations. + If True, need to set self.test_mode as False before loading. + + Returns: + np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: + A single or a list of loaded data. + """ + assert pipeline is not None, 'data loading pipeline is not provided' + img_info = self.data_infos[index] + input_dict = dict(img_info=img_info) + + if load_annos: + ann_info = self.get_ann_info(index) + input_dict.update(dict(ann_info=ann_info)) + + self.pre_pipeline(input_dict) + example = pipeline(input_dict) + + # extract data items according to keys + if isinstance(key, str): + data = extract_result_dict(example, key) + else: + data = [extract_result_dict(example, k) for k in key] + + return data + + def _get_pipeline(self, pipeline): + """Get data loading pipeline in self.show/evaluate function. + + Args: + pipeline (list[dict]): Input pipeline. If None is given, + get from self.pipeline. + """ + if pipeline is None: + if not hasattr(self, 'pipeline') or self.pipeline is None: + warnings.warn( + 'Use default pipeline for data loading, this may cause ' + 'errors when data is on ceph') + return self._build_default_pipeline() + loading_pipeline = get_loading_pipeline(self.pipeline.transforms) + return Compose(loading_pipeline) + return Compose(pipeline) + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['img']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=False, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Whether to visualize the results online. 
+ Default: False. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + if 'img_bbox' in result.keys(): + result = result['img_bbox'] + data_info = self.data_infos[i] + img_path = data_info['file_name'] + file_name = osp.split(img_path)[-1].split('.')[0] + img, img_metas = self._extract_data(i, pipeline, + ['img', 'img_metas']) + # need to transpose channel to first dim + img = img.numpy().transpose(1, 2, 0) + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'] + pred_bboxes = result['boxes_3d'] + show_multi_modality_result( + img, + gt_bboxes, + pred_bboxes, + img_metas['cam2img'], + out_dir, + file_name, + box_mode='camera', + show=show) + + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + + Args: + detection (dict): Detection results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + - attrs_3d (torch.Tensor, optional): Predicted attributes. + + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. + """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + attrs = None + if 'attrs_3d' in detection: + attrs = detection['attrs_3d'].numpy() + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + + # convert the dim/rot to nuscbox convention + box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]] + box_yaw = -box_yaw + + box_list = [] + for i in range(len(box3d)): + q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2) + quat = q2 * q1 + velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8]) + box = NuScenesBox( + box_gravity_center[i], + box_dims[i], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list, attrs + + +def cam_nusc_box_to_global(info, + boxes, + attrs, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from camera to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str, optional): Evaluation version. + Default: 'detection_cvpr_2019' + + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + attr_list = [] + for (box, attr) in zip(boxes, attrs): + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation'])) + box.translate(np.array(info['cam2ego_translation'])) + # filter det in ego. 
+ cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + attr_list.append(attr) + return box_list, attr_list + + +def global_nusc_box_to_cam(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from global to camera coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str, optional): Evaluation version. + Default: 'detection_cvpr_2019' + + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.translate(-np.array(info['ego2global_translation'])) + box.rotate( + pyquaternion.Quaternion(info['ego2global_rotation']).inverse) + # filter det in ego. + cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to camera coord system + box.translate(-np.array(info['cam2ego_translation'])) + box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse) + box_list.append(box) + return box_list + + +def nusc_box_to_cam_box3d(boxes): + """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`. + + Args: + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + + Returns: + tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): + Converted 3D bounding boxes, scores and labels. + """ + locs = torch.Tensor([b.center for b in boxes]).view(-1, 3) + dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3) + rots = torch.Tensor([b.orientation.yaw_pitch_roll[0] + for b in boxes]).view(-1, 1) + velocity = torch.Tensor([b.velocity[0::2] for b in boxes]).view(-1, 2) + + # convert nusbox to cambox convention + dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]] + rots = -rots + + boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda() + cam_boxes3d = CameraInstance3DBoxes( + boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5)) + scores = torch.Tensor([b.score for b in boxes]).cuda() + labels = torch.LongTensor([b.label for b in boxes]).cuda() + nms_scores = scores.new_zeros(scores.shape[0], 10 + 1) + indices = labels.new_tensor(list(range(scores.shape[0]))) + nms_scores[indices, labels] = scores + return cam_boxes3d, nms_scores, labels diff --git a/mmdet3d/datasets/pipelines/.ipynb_checkpoints/__init__-checkpoint.py b/mmdet3d/datasets/pipelines/.ipynb_checkpoints/__init__-checkpoint.py index 317f605..76c43a6 100644 --- a/mmdet3d/datasets/pipelines/.ipynb_checkpoints/__init__-checkpoint.py +++ b/mmdet3d/datasets/pipelines/.ipynb_checkpoints/__init__-checkpoint.py @@ -1,34 +1,34 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
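# Quick numeric check (sketch) of the dim/yaw convention swap used by
# `output_to_nusc_box` and `nusc_box_to_cam_box3d` above: the two index
# permutations are inverses of each other, so converting a camera box to the
# nuScenes convention and back restores the original dims and yaw.
import numpy as np

cam_dims = np.array([[4.0, 1.5, 1.8]])   # example box dimensions
cam_yaw = np.array([0.3])

nusc_dims = cam_dims.copy()
nusc_dims[:, [0, 1, 2]] = cam_dims[:, [2, 0, 1]]    # camera -> nuScenes
nusc_yaw = -cam_yaw

back_dims = nusc_dims.copy()
back_dims[:, [0, 1, 2]] = nusc_dims[:, [1, 2, 0]]   # nuScenes -> camera
back_yaw = -nusc_yaw

assert np.allclose(back_dims, cam_dims) and np.allclose(back_yaw, cam_yaw)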
-from .compose import Compose -from .dbsampler import DataBaseSampler -from .formating import Collect3D, DefaultFormatBundle, DefaultFormatBundle3D -from .loading import (LoadAnnotations3D, LoadImageFromFileMono3D, - LoadMultiViewImageFromFiles, LoadPointsFromDict, - LoadPointsFromFile, LoadPointsFromMultiSweeps, - NormalizePointsColor, PointSegClassMapping) -from .test_time_aug import MultiScaleFlipAug3D -# yapf: disable -from .transforms_3d import (AffineResize, BackgroundPointsFilter, - GlobalAlignment, GlobalRotScaleTrans, - IndoorPatchPointSample, IndoorPointSample, - MultiViewWrapper, ObjectNameFilter, ObjectNoise, - ObjectRangeFilter, ObjectSample, PointSample, - PointShuffle, PointsRangeFilter, - RandomDropPointsColor, RandomFlip3D, - RandomJitterPoints, RandomRotate, RandomShiftScale, - RangeLimitedRandomCrop, VoxelBasedPointSampler) - -__all__ = [ - 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', - 'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D', - 'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile', - 'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler', - 'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample', - 'PointSample', 'PointSegClassMapping', 'MultiScaleFlipAug3D', - 'LoadPointsFromMultiSweeps', 'BackgroundPointsFilter', - 'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample', - 'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor', - 'RandomJitterPoints', 'AffineResize', 'RandomShiftScale', - 'LoadPointsFromDict', 'MultiViewWrapper', 'RandomRotate', - 'RangeLimitedRandomCrop' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .compose import Compose +from .dbsampler import DataBaseSampler +from .formating import Collect3D, DefaultFormatBundle, DefaultFormatBundle3D +from .loading import (LoadAnnotations3D, LoadImageFromFileMono3D, + LoadMultiViewImageFromFiles, LoadPointsFromDict, + LoadPointsFromFile, LoadPointsFromMultiSweeps, + NormalizePointsColor, PointSegClassMapping) +from .test_time_aug import MultiScaleFlipAug3D +# yapf: disable +from .transforms_3d import (AffineResize, BackgroundPointsFilter, + GlobalAlignment, GlobalRotScaleTrans, + IndoorPatchPointSample, IndoorPointSample, + MultiViewWrapper, ObjectNameFilter, ObjectNoise, + ObjectRangeFilter, ObjectSample, PointSample, + PointShuffle, PointsRangeFilter, + RandomDropPointsColor, RandomFlip3D, + RandomJitterPoints, RandomRotate, RandomShiftScale, + RangeLimitedRandomCrop, VoxelBasedPointSampler) + +__all__ = [ + 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', + 'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D', + 'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile', + 'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler', + 'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample', + 'PointSample', 'PointSegClassMapping', 'MultiScaleFlipAug3D', + 'LoadPointsFromMultiSweeps', 'BackgroundPointsFilter', + 'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample', + 'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor', + 'RandomJitterPoints', 'AffineResize', 'RandomShiftScale', + 'LoadPointsFromDict', 'MultiViewWrapper', 'RandomRotate', + 'RangeLimitedRandomCrop' +] diff --git a/mmdet3d/datasets/pipelines/.ipynb_checkpoints/transforms_3d-checkpoint.py b/mmdet3d/datasets/pipelines/.ipynb_checkpoints/transforms_3d-checkpoint.py index d2dc076..a8b65a6 100644 --- 
a/mmdet3d/datasets/pipelines/.ipynb_checkpoints/transforms_3d-checkpoint.py +++ b/mmdet3d/datasets/pipelines/.ipynb_checkpoints/transforms_3d-checkpoint.py @@ -1,1853 +1,1853 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import random -import warnings - -import cv2 -import numpy as np -from mmcv import is_tuple_of -from mmcv.utils import build_from_cfg - -from mmdet3d.core import VoxelGenerator -from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, - LiDARInstance3DBoxes, box_np_ops) -from mmdet3d.datasets.pipelines.compose import Compose -from mmdet.datasets.pipelines import RandomCrop, RandomFlip, Rotate -from ..builder import OBJECTSAMPLERS, PIPELINES -from .data_augment_utils import noise_per_object_v3_ - - -@PIPELINES.register_module() -class RandomDropPointsColor(object): - r"""Randomly set the color of points to all zeros. - - Once this transform is executed, all the points' color will be dropped. - Refer to `PAConv `_ for more details. - - Args: - drop_ratio (float, optional): The probability of dropping point colors. - Defaults to 0.2. - """ - - def __init__(self, drop_ratio=0.2): - assert isinstance(drop_ratio, (int, float)) and 0 <= drop_ratio <= 1, \ - f'invalid drop_ratio value {drop_ratio}' - self.drop_ratio = drop_ratio - - def __call__(self, input_dict): - """Call function to drop point colors. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after color dropping, - 'points' key is updated in the result dict. - """ - points = input_dict['points'] - assert points.attribute_dims is not None and \ - 'color' in points.attribute_dims, \ - 'Expect points have color attribute' - - # this if-expression is a bit strange - # `RandomDropPointsColor` is used in training 3D segmentor PAConv - # we discovered in our experiments that, using - # `if np.random.rand() > 1.0 - self.drop_ratio` consistently leads to - # better results than using `if np.random.rand() < self.drop_ratio` - # so we keep this hack in our codebase - if np.random.rand() > 1.0 - self.drop_ratio: - points.color = points.color * 0.0 - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(drop_ratio={self.drop_ratio})' - return repr_str - - -@PIPELINES.register_module() -class RandomFlip3D(RandomFlip): - """Flip the points & bbox. - - If the input dict contains the key "flip", then the flag will be used, - otherwise it will be randomly decided by a ratio specified in the init - method. - - Args: - sync_2d (bool, optional): Whether to apply flip according to the 2D - images. If True, it will apply the same flip as that to 2D images. - If False, it will decide whether to flip randomly and independently - to that of 2D images. Defaults to True. - flip_ratio_bev_horizontal (float, optional): The flipping probability - in horizontal direction. Defaults to 0.0. - flip_ratio_bev_vertical (float, optional): The flipping probability - in vertical direction. Defaults to 0.0. 
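# Side note on the condition in `RandomDropPointsColor.__call__` above: for
# U ~ Uniform[0, 1), P(U > 1 - p) equals P(U < p), so either form drops the
# colors with probability `drop_ratio`; a quick empirical check (sketch):
import numpy as np

u = np.random.rand(1_000_000)
p = 0.2
print((u > 1.0 - p).mean(), (u < p).mean())   # both are approximately 0.2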
- """ - - def __init__(self, - sync_2d=True, - flip_ratio_bev_horizontal=0.0, - flip_ratio_bev_vertical=0.0, - **kwargs): - super(RandomFlip3D, self).__init__( - flip_ratio=flip_ratio_bev_horizontal, **kwargs) - self.sync_2d = sync_2d - self.flip_ratio_bev_vertical = flip_ratio_bev_vertical - if flip_ratio_bev_horizontal is not None: - assert isinstance( - flip_ratio_bev_horizontal, - (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1 - if flip_ratio_bev_vertical is not None: - assert isinstance( - flip_ratio_bev_vertical, - (int, float)) and 0 <= flip_ratio_bev_vertical <= 1 - - def random_flip_data_3d(self, input_dict, direction='horizontal'): - """Flip 3D data randomly. - - Args: - input_dict (dict): Result dict from loading pipeline. - direction (str, optional): Flip direction. - Default: 'horizontal'. - - Returns: - dict: Flipped results, 'points', 'bbox3d_fields' keys are - updated in the result dict. - """ - assert direction in ['horizontal', 'vertical'] - # for semantic segmentation task, only points will be flipped. - if 'bbox3d_fields' not in input_dict: - input_dict['points'].flip(direction) - return - if len(input_dict['bbox3d_fields']) == 0: # test mode - input_dict['bbox3d_fields'].append('empty_box3d') - input_dict['empty_box3d'] = input_dict['box_type_3d']( - np.array([], dtype=np.float32)) - assert len(input_dict['bbox3d_fields']) == 1 - for key in input_dict['bbox3d_fields']: - if 'points' in input_dict: - input_dict['points'] = input_dict[key].flip( - direction, points=input_dict['points']) - else: - input_dict[key].flip(direction) - if 'centers2d' in input_dict: - assert self.sync_2d is True and direction == 'horizontal', \ - 'Only support sync_2d=True and horizontal flip with images' - w = input_dict['ori_shape'][1] - input_dict['centers2d'][..., 0] = \ - w - input_dict['centers2d'][..., 0] - # need to modify the horizontal position of camera center - # along u-axis in the image (flip like centers2d) - # ['cam2img'][0][2] = c_u - # see more details and examples at - # https://github.com/open-mmlab/mmdetection3d/pull/744 - input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2] - - def __call__(self, input_dict): - """Call function to flip points, values in the ``bbox3d_fields`` and - also flip 2D image and its annotations. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Flipped results, 'flip', 'flip_direction', - 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added - into result dict. 
- """ - # flip 2D image and its annotations - super(RandomFlip3D, self).__call__(input_dict) - - if self.sync_2d: - input_dict['pcd_horizontal_flip'] = input_dict['flip'] - input_dict['pcd_vertical_flip'] = False - else: - if 'pcd_horizontal_flip' not in input_dict: - flip_horizontal = True if np.random.rand( - ) < self.flip_ratio else False - input_dict['pcd_horizontal_flip'] = flip_horizontal - if 'pcd_vertical_flip' not in input_dict: - flip_vertical = True if np.random.rand( - ) < self.flip_ratio_bev_vertical else False - input_dict['pcd_vertical_flip'] = flip_vertical - - if 'transformation_3d_flow' not in input_dict: - input_dict['transformation_3d_flow'] = [] - - if input_dict['pcd_horizontal_flip']: - self.random_flip_data_3d(input_dict, 'horizontal') - input_dict['transformation_3d_flow'].extend(['HF']) - if input_dict['pcd_vertical_flip']: - self.random_flip_data_3d(input_dict, 'vertical') - input_dict['transformation_3d_flow'].extend(['VF']) - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(sync_2d={self.sync_2d},' - repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})' - return repr_str - - -@PIPELINES.register_module() -class MultiViewWrapper(object): - """Wrap transformation from single-view into multi-view. - - The wrapper processes the images from multi-view one by one. For each - image, it constructs a pseudo dict according to the keys specified by the - 'process_fields' parameter. After the transformation is finished, desired - information can be collected by specifying the keys in the 'collected_keys' - parameter. Multi-view images share the same transformation parameters - but do not share the same magnitude when a random transformation is - conducted. - - Args: - transforms (list[dict]): A list of dict specifying the transformations - for the monocular situation. - process_fields (dict): Desired keys that the transformations should - be conducted on. Default to dict(img_fields=['img']). - collected_keys (list[str]): Collect information in transformation - like rotate angles, crop roi, and flip state. - """ - - def __init__(self, - transforms, - process_fields=dict(img_fields=['img']), - collected_keys=[]): - self.transform = Compose(transforms) - self.collected_keys = collected_keys - self.process_fields = process_fields - - def __call__(self, input_dict): - for key in self.collected_keys: - input_dict[key] = [] - for img_id in range(len(input_dict['img'])): - process_dict = self.process_fields.copy() - for field in self.process_fields: - for key in self.process_fields[field]: - process_dict[key] = input_dict[key][img_id] - process_dict = self.transform(process_dict) - for field in self.process_fields: - for key in self.process_fields[field]: - input_dict[key][img_id] = process_dict[key] - for key in self.collected_keys: - input_dict[key].append(process_dict[key]) - return input_dict - - -@PIPELINES.register_module() -class RangeLimitedRandomCrop(RandomCrop): - """Randomly crop image-view objects under a limitation of range. - - Args: - relative_x_offset_range (tuple[float]): Relative range of random crop - in x direction. (x_min, x_max) in [0, 1.0]. Default to (0.0, 1.0). - relative_y_offset_range (tuple[float]): Relative range of random crop - in y direction. (y_min, y_max) in [0, 1.0]. Default to (0.0, 1.0). 
- """ - - def __init__(self, - relative_x_offset_range=(0.0, 1.0), - relative_y_offset_range=(0.0, 1.0), - **kwargs): - super(RangeLimitedRandomCrop, self).__init__(**kwargs) - for range in [relative_x_offset_range, relative_y_offset_range]: - assert 0 <= range[0] <= range[1] <= 1 - self.relative_x_offset_range = relative_x_offset_range - self.relative_y_offset_range = relative_y_offset_range - - def _crop_data(self, results, crop_size, allow_negative_crop): - """Function to randomly crop images. - - Modified from RandomCrop in mmdet==2.25.0 - - Args: - results (dict): Result dict from loading pipeline. - crop_size (tuple): Expected absolute size after cropping, (h, w). - - Returns: - dict: Randomly cropped results, 'img_shape' key in result dict is - updated according to crop size. - """ - assert crop_size[0] > 0 and crop_size[1] > 0 - for key in results.get('img_fields', ['img']): - img = results[key] - margin_h = max(img.shape[0] - crop_size[0], 0) - margin_w = max(img.shape[1] - crop_size[1], 0) - offset_range_h = (margin_h * self.relative_y_offset_range[0], - margin_h * self.relative_y_offset_range[1] + 1) - offset_h = np.random.randint(*offset_range_h) - offset_range_w = (margin_w * self.relative_x_offset_range[0], - margin_w * self.relative_x_offset_range[1] + 1) - offset_w = np.random.randint(*offset_range_w) - crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] - crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] - - # crop the image - img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] - img_shape = img.shape - results[key] = img - results['crop'] = (crop_x1, crop_y1, crop_x2, crop_y2) - results['img_shape'] = img_shape - - # crop bboxes accordingly and clip to the image boundary - for key in results.get('bbox_fields', []): - # e.g. gt_bboxes and gt_bboxes_ignore - bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], - dtype=np.float32) - bboxes = results[key] - bbox_offset - if self.bbox_clip_border: - bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) - bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) - valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( - bboxes[:, 3] > bboxes[:, 1]) - # If the crop does not contain any gt-bbox area and - # allow_negative_crop is False, skip this image. - if (key == 'gt_bboxes' and not valid_inds.any() - and not allow_negative_crop): - return None - results[key] = bboxes[valid_inds, :] - # label fields. e.g. gt_labels and gt_labels_ignore - label_key = self.bbox2label.get(key) - if label_key in results: - results[label_key] = results[label_key][valid_inds] - - # mask fields, e.g. gt_masks and gt_masks_ignore - mask_key = self.bbox2mask.get(key) - if mask_key in results: - results[mask_key] = results[mask_key][ - valid_inds.nonzero()[0]].crop( - np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) - if self.recompute_bbox: - results[key] = results[mask_key].get_bboxes() - - # crop semantic seg - for key in results.get('seg_fields', []): - results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] - - return results - - -@PIPELINES.register_module() -class RandomRotate(Rotate): - """Randomly rotate images. - - The ratation angle is selected uniformly within the interval specified by - the 'range' parameter. - - Args: - range (tuple[float]): Define the range of random rotation. - (angle_min, angle_max) in angle. 
- """ - - def __init__(self, range, **kwargs): - super(RandomRotate, self).__init__(**kwargs) - self.range = range - - def __call__(self, results): - self.angle = np.random.uniform(self.range[0], self.range[1]) - super(RandomRotate, self).__call__(results) - results['rotate'] = self.angle - return results - - -@PIPELINES.register_module() -class RandomJitterPoints(object): - """Randomly jitter point coordinates. - - Different from the global translation in ``GlobalRotScaleTrans``, here we - apply different noises to each point in a scene. - - Args: - jitter_std (list[float]): The standard deviation of jittering noise. - This applies random noise to all points in a 3D scene, which is - sampled from a gaussian distribution whose standard deviation is - set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01] - clip_range (list[float]): Clip the randomly generated jitter - noise into this range. If None is given, don't perform clipping. - Defaults to [-0.05, 0.05] - - Note: - This transform should only be used in point cloud segmentation tasks - because we don't transform ground-truth bboxes accordingly. - For similar transform in detection task, please refer to `ObjectNoise`. - """ - - def __init__(self, - jitter_std=[0.01, 0.01, 0.01], - clip_range=[-0.05, 0.05]): - seq_types = (list, tuple, np.ndarray) - if not isinstance(jitter_std, seq_types): - assert isinstance(jitter_std, (int, float)), \ - f'unsupported jitter_std type {type(jitter_std)}' - jitter_std = [jitter_std, jitter_std, jitter_std] - self.jitter_std = jitter_std - - if clip_range is not None: - if not isinstance(clip_range, seq_types): - assert isinstance(clip_range, (int, float)), \ - f'unsupported clip_range type {type(clip_range)}' - clip_range = [-clip_range, clip_range] - self.clip_range = clip_range - - def __call__(self, input_dict): - """Call function to jitter all the points in the scene. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after adding noise to each point, - 'points' key is updated in the result dict. - """ - points = input_dict['points'] - jitter_std = np.array(self.jitter_std, dtype=np.float32) - jitter_noise = \ - np.random.randn(points.shape[0], 3) * jitter_std[None, :] - if self.clip_range is not None: - jitter_noise = np.clip(jitter_noise, self.clip_range[0], - self.clip_range[1]) - - points.translate(jitter_noise) - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(jitter_std={self.jitter_std},' - repr_str += f' clip_range={self.clip_range})' - return repr_str - - -@PIPELINES.register_module() -class ObjectSample(object): - """Sample GT objects to the data. - - Args: - db_sampler (dict): Config dict of the database sampler. - sample_2d (bool): Whether to also paste 2D image patch to the images - This should be true when applying multi-modality cut-and-paste. - Defaults to False. - use_ground_plane (bool): Whether to use gound plane to adjust the - 3D labels. - """ - - def __init__(self, db_sampler, sample_2d=False, use_ground_plane=False): - self.sampler_cfg = db_sampler - self.sample_2d = sample_2d - if 'type' not in db_sampler.keys(): - db_sampler['type'] = 'DataBaseSampler' - self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) - self.use_ground_plane = use_ground_plane - - @staticmethod - def remove_points_in_boxes(points, boxes): - """Remove the points in the sampled bounding boxes. 
- - Args: - points (:obj:`BasePoints`): Input point cloud array. - boxes (np.ndarray): Sampled ground truth boxes. - - Returns: - np.ndarray: Points with those in the boxes removed. - """ - masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes) - points = points[np.logical_not(masks.any(-1))] - return points - - def __call__(self, input_dict): - """Call function to sample ground truth objects to the data. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after object sampling augmentation, - 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated - in the result dict. - """ - gt_bboxes_3d = input_dict['gt_bboxes_3d'] - gt_labels_3d = input_dict['gt_labels_3d'] - - if self.use_ground_plane and 'plane' in input_dict['ann_info']: - ground_plane = input_dict['ann_info']['plane'] - input_dict['plane'] = ground_plane - else: - ground_plane = None - # change to float for blending operation - points = input_dict['points'] - if self.sample_2d: - img = input_dict['img'] - gt_bboxes_2d = input_dict['gt_bboxes'] - # Assume for now 3D & 2D bboxes are the same - sampled_dict = self.db_sampler.sample_all( - gt_bboxes_3d.tensor.numpy(), - gt_labels_3d, - gt_bboxes_2d=gt_bboxes_2d, - img=img) - else: - sampled_dict = self.db_sampler.sample_all( - gt_bboxes_3d.tensor.numpy(), - gt_labels_3d, - img=None, - ground_plane=ground_plane) - - if sampled_dict is not None: - sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] - sampled_points = sampled_dict['points'] - sampled_gt_labels = sampled_dict['gt_labels_3d'] - - gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels], - axis=0) - gt_bboxes_3d = gt_bboxes_3d.new_box( - np.concatenate( - [gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d])) - - points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) - # check the points dimension - points = points.cat([sampled_points, points]) - - if self.sample_2d: - sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d'] - gt_bboxes_2d = np.concatenate( - [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) - - input_dict['gt_bboxes'] = gt_bboxes_2d - input_dict['img'] = sampled_dict['img'] - - input_dict['gt_bboxes_3d'] = gt_bboxes_3d - input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.int64) - input_dict['points'] = points - - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f' sample_2d={self.sample_2d},' - repr_str += f' data_root={self.sampler_cfg.data_root},' - repr_str += f' info_path={self.sampler_cfg.info_path},' - repr_str += f' rate={self.sampler_cfg.rate},' - repr_str += f' prepare={self.sampler_cfg.prepare},' - repr_str += f' classes={self.sampler_cfg.classes},' - repr_str += f' sample_groups={self.sampler_cfg.sample_groups}' - return repr_str - - -@PIPELINES.register_module() -class ObjectNoise(object): - """Apply noise to each GT objects in the scene. - - Args: - translation_std (list[float], optional): Standard deviation of the - distribution where translation noise are sampled from. - Defaults to [0.25, 0.25, 0.25]. - global_rot_range (list[float], optional): Global rotation to the scene. - Defaults to [0.0, 0.0]. - rot_range (list[float], optional): Object rotation range. - Defaults to [-0.15707963267, 0.15707963267]. - num_try (int, optional): Number of times to try if the noise applied is - invalid. Defaults to 100. 
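# Illustrative sketch of an `ObjectSample` pipeline entry and its database
# sampler config, using the fields echoed by `ObjectSample.__repr__` above;
# the paths, class list, `prepare` filter and sample counts are placeholders
# following the usual mmdet3d `DataBaseSampler` conventions.
example_db_sampler = dict(
    type='DataBaseSampler',
    data_root='data/kitti/',                          # placeholder
    info_path='data/kitti/kitti_dbinfos_train.pkl',   # placeholder
    rate=1.0,
    prepare=dict(filter_by_min_points=dict(Car=5)),
    classes=['Car'],
    sample_groups=dict(Car=15))
example_objectsample_entry = dict(
    type='ObjectSample', db_sampler=example_db_sampler, sample_2d=False)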
- """ - - def __init__(self, - translation_std=[0.25, 0.25, 0.25], - global_rot_range=[0.0, 0.0], - rot_range=[-0.15707963267, 0.15707963267], - num_try=100): - self.translation_std = translation_std - self.global_rot_range = global_rot_range - self.rot_range = rot_range - self.num_try = num_try - - def __call__(self, input_dict): - """Call function to apply noise to each ground truth in the scene. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after adding noise to each object, - 'points', 'gt_bboxes_3d' keys are updated in the result dict. - """ - gt_bboxes_3d = input_dict['gt_bboxes_3d'] - points = input_dict['points'] - - # TODO: this is inplace operation - numpy_box = gt_bboxes_3d.tensor.numpy() - numpy_points = points.tensor.numpy() - - noise_per_object_v3_( - numpy_box, - numpy_points, - rotation_perturb=self.rot_range, - center_noise_std=self.translation_std, - global_random_rot_range=self.global_rot_range, - num_try=self.num_try) - - input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box) - input_dict['points'] = points.new_point(numpy_points) - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(num_try={self.num_try},' - repr_str += f' translation_std={self.translation_std},' - repr_str += f' global_rot_range={self.global_rot_range},' - repr_str += f' rot_range={self.rot_range})' - return repr_str - - -@PIPELINES.register_module() -class GlobalAlignment(object): - """Apply global alignment to 3D scene points by rotation and translation. - - Args: - rotation_axis (int): Rotation axis for points and bboxes rotation. - - Note: - We do not record the applied rotation and translation as in - GlobalRotScaleTrans. Because usually, we do not need to reverse - the alignment step. - For example, ScanNet 3D detection task uses aligned ground-truth - bounding boxes for evaluation. - """ - - def __init__(self, rotation_axis): - self.rotation_axis = rotation_axis - - def _trans_points(self, input_dict, trans_factor): - """Private function to translate points. - - Args: - input_dict (dict): Result dict from loading pipeline. - trans_factor (np.ndarray): Translation vector to be applied. - - Returns: - dict: Results after translation, 'points' is updated in the dict. - """ - input_dict['points'].translate(trans_factor) - - def _rot_points(self, input_dict, rot_mat): - """Private function to rotate bounding boxes and points. - - Args: - input_dict (dict): Result dict from loading pipeline. - rot_mat (np.ndarray): Rotation matrix to be applied. - - Returns: - dict: Results after rotation, 'points' is updated in the dict. - """ - # input should be rot_mat_T so I transpose it here - input_dict['points'].rotate(rot_mat.T) - - def _check_rot_mat(self, rot_mat): - """Check if rotation matrix is valid for self.rotation_axis. - - Args: - rot_mat (np.ndarray): Rotation matrix to be applied. - """ - is_valid = np.allclose(np.linalg.det(rot_mat), 1.0) - valid_array = np.zeros(3) - valid_array[self.rotation_axis] = 1.0 - is_valid &= (rot_mat[self.rotation_axis, :] == valid_array).all() - is_valid &= (rot_mat[:, self.rotation_axis] == valid_array).all() - assert is_valid, f'invalid rotation matrix {rot_mat}' - - def __call__(self, input_dict): - """Call function to shuffle points. - - Args: - input_dict (dict): Result dict from loading pipeline. 
- - Returns: - dict: Results after global alignment, 'points' and keys in - input_dict['bbox3d_fields'] are updated in the result dict. - """ - assert 'axis_align_matrix' in input_dict['ann_info'].keys(), \ - 'axis_align_matrix is not provided in GlobalAlignment' - - axis_align_matrix = input_dict['ann_info']['axis_align_matrix'] - assert axis_align_matrix.shape == (4, 4), \ - f'invalid shape {axis_align_matrix.shape} for axis_align_matrix' - rot_mat = axis_align_matrix[:3, :3] - trans_vec = axis_align_matrix[:3, -1] - - self._check_rot_mat(rot_mat) - self._rot_points(input_dict, rot_mat) - self._trans_points(input_dict, trans_vec) - - return input_dict - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(rotation_axis={self.rotation_axis})' - return repr_str - - -@PIPELINES.register_module() -class GlobalRotScaleTrans(object): - """Apply global rotation, scaling and translation to a 3D scene. - - Args: - rot_range (list[float], optional): Range of rotation angle. - Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). - scale_ratio_range (list[float], optional): Range of scale ratio. - Defaults to [0.95, 1.05]. - translation_std (list[float], optional): The standard deviation of - translation noise applied to a scene, which - is sampled from a gaussian distribution whose standard deviation - is set by ``translation_std``. Defaults to [0, 0, 0] - shift_height (bool, optional): Whether to shift height. - (the fourth dimension of indoor points) when scaling. - Defaults to False. - """ - - def __init__(self, - rot_range=[-0.78539816, 0.78539816], - scale_ratio_range=[0.95, 1.05], - translation_std=[0, 0, 0], - shift_height=False): - seq_types = (list, tuple, np.ndarray) - if not isinstance(rot_range, seq_types): - assert isinstance(rot_range, (int, float)), \ - f'unsupported rot_range type {type(rot_range)}' - rot_range = [-rot_range, rot_range] - self.rot_range = rot_range - - assert isinstance(scale_ratio_range, seq_types), \ - f'unsupported scale_ratio_range type {type(scale_ratio_range)}' - self.scale_ratio_range = scale_ratio_range - - if not isinstance(translation_std, seq_types): - assert isinstance(translation_std, (int, float)), \ - f'unsupported translation_std type {type(translation_std)}' - translation_std = [ - translation_std, translation_std, translation_std - ] - assert all([std >= 0 for std in translation_std]), \ - 'translation_std should be positive' - self.translation_std = translation_std - self.shift_height = shift_height - - def _trans_bbox_points(self, input_dict): - """Private function to translate bounding boxes and points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after translation, 'points', 'pcd_trans' - and keys in input_dict['bbox3d_fields'] are updated - in the result dict. - """ - translation_std = np.array(self.translation_std, dtype=np.float32) - trans_factor = np.random.normal(scale=translation_std, size=3).T - - input_dict['points'].translate(trans_factor) - input_dict['pcd_trans'] = trans_factor - for key in input_dict['bbox3d_fields']: - input_dict[key].translate(trans_factor) - - def _rot_bbox_points(self, input_dict): - """Private function to rotate bounding boxes and points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after rotation, 'points', 'pcd_rotation' - and keys in input_dict['bbox3d_fields'] are updated - in the result dict. 
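# Sketch of the argument normalization performed in GlobalRotScaleTrans.__init__
# above: scalar arguments are expanded to symmetric / per-axis lists (assumes
# this module is importable as shown).
from mmdet3d.datasets.pipelines.transforms_3d import GlobalRotScaleTrans
grst = GlobalRotScaleTrans(rot_range=0.3925, translation_std=0.5)
assert grst.rot_range == [-0.3925, 0.3925]
assert grst.translation_std == [0.5, 0.5, 0.5]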
- """ - rotation = self.rot_range - noise_rotation = np.random.uniform(rotation[0], rotation[1]) - - # if no bbox in input_dict, only rotate points - if len(input_dict['bbox3d_fields']) == 0: - rot_mat_T = input_dict['points'].rotate(noise_rotation) - input_dict['pcd_rotation'] = rot_mat_T - input_dict['pcd_rotation_angle'] = noise_rotation - return - - # rotate points with bboxes - for key in input_dict['bbox3d_fields']: - if len(input_dict[key].tensor) != 0: - points, rot_mat_T = input_dict[key].rotate( - noise_rotation, input_dict['points']) - input_dict['points'] = points - input_dict['pcd_rotation'] = rot_mat_T - input_dict['pcd_rotation_angle'] = noise_rotation - - def _scale_bbox_points(self, input_dict): - """Private function to scale bounding boxes and points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after scaling, 'points'and keys in - input_dict['bbox3d_fields'] are updated in the result dict. - """ - scale = input_dict['pcd_scale_factor'] - points = input_dict['points'] - points.scale(scale) - if self.shift_height: - assert 'height' in points.attribute_dims.keys(), \ - 'setting shift_height=True but points have no height attribute' - points.tensor[:, points.attribute_dims['height']] *= scale - input_dict['points'] = points - - for key in input_dict['bbox3d_fields']: - input_dict[key].scale(scale) - - def _random_scale(self, input_dict): - """Private function to randomly set the scale factor. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after scaling, 'pcd_scale_factor' are updated - in the result dict. - """ - scale_factor = np.random.uniform(self.scale_ratio_range[0], - self.scale_ratio_range[1]) - input_dict['pcd_scale_factor'] = scale_factor - - def __call__(self, input_dict): - """Private function to rotate, scale and translate bounding boxes and - points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after scaling, 'points', 'pcd_rotation', - 'pcd_scale_factor', 'pcd_trans' and keys in - input_dict['bbox3d_fields'] are updated in the result dict. - """ - if 'transformation_3d_flow' not in input_dict: - input_dict['transformation_3d_flow'] = [] - - self._rot_bbox_points(input_dict) - - if 'pcd_scale_factor' not in input_dict: - self._random_scale(input_dict) - self._scale_bbox_points(input_dict) - - self._trans_bbox_points(input_dict) - - input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(rot_range={self.rot_range},' - repr_str += f' scale_ratio_range={self.scale_ratio_range},' - repr_str += f' translation_std={self.translation_std},' - repr_str += f' shift_height={self.shift_height})' - return repr_str - - -@PIPELINES.register_module() -class PointShuffle(object): - """Shuffle input points.""" - - def __call__(self, input_dict): - """Call function to shuffle points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. 
- """ - idx = input_dict['points'].shuffle() - idx = idx.numpy() - - pts_instance_mask = input_dict.get('pts_instance_mask', None) - pts_semantic_mask = input_dict.get('pts_semantic_mask', None) - - if pts_instance_mask is not None: - input_dict['pts_instance_mask'] = pts_instance_mask[idx] - - if pts_semantic_mask is not None: - input_dict['pts_semantic_mask'] = pts_semantic_mask[idx] - - return input_dict - - def __repr__(self): - return self.__class__.__name__ - - -@PIPELINES.register_module() -class ObjectRangeFilter(object): - """Filter objects by the range. - - Args: - point_cloud_range (list[float]): Point cloud range. - """ - - def __init__(self, point_cloud_range): - self.pcd_range = np.array(point_cloud_range, dtype=np.float32) - - def __call__(self, input_dict): - """Call function to filter objects by the range. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' - keys are updated in the result dict. - """ - # Check points instance type and initialise bev_range - if isinstance(input_dict['gt_bboxes_3d'], - (LiDARInstance3DBoxes, DepthInstance3DBoxes)): - bev_range = self.pcd_range[[0, 1, 3, 4]] - elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): - bev_range = self.pcd_range[[0, 2, 3, 5]] - - gt_bboxes_3d = input_dict['gt_bboxes_3d'] - gt_labels_3d = input_dict['gt_labels_3d'] - mask = gt_bboxes_3d.in_range_bev(bev_range) - gt_bboxes_3d = gt_bboxes_3d[mask] - # mask is a torch tensor but gt_labels_3d is still numpy array - # using mask to index gt_labels_3d will cause bug when - # len(gt_labels_3d) == 1, where mask=1 will be interpreted - # as gt_labels_3d[1] and cause out of index error - gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] - - # limit rad to [-pi, pi] - gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) - input_dict['gt_bboxes_3d'] = gt_bboxes_3d - input_dict['gt_labels_3d'] = gt_labels_3d - - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' - return repr_str - - -@PIPELINES.register_module() -class PointsRangeFilter(object): - """Filter points by the range. - - Args: - point_cloud_range (list[float]): Point cloud range. - """ - - def __init__(self, point_cloud_range): - self.pcd_range = np.array(point_cloud_range, dtype=np.float32) - - def __call__(self, input_dict): - """Call function to filter points by the range. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. 
- """ - points = input_dict['points'] - points_mask = points.in_range_3d(self.pcd_range) - clean_points = points[points_mask] - input_dict['points'] = clean_points - points_mask = points_mask.numpy() - - pts_instance_mask = input_dict.get('pts_instance_mask', None) - pts_semantic_mask = input_dict.get('pts_semantic_mask', None) - - if pts_instance_mask is not None: - input_dict['pts_instance_mask'] = pts_instance_mask[points_mask] - - if pts_semantic_mask is not None: - input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask] - - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' - return repr_str - - -@PIPELINES.register_module() -class ObjectNameFilter(object): - """Filter GT objects by their names. - - Args: - classes (list[str]): List of class names to be kept for training. - """ - - def __init__(self, classes): - self.classes = classes - self.labels = list(range(len(self.classes))) - - def __call__(self, input_dict): - """Call function to filter objects by their names. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' - keys are updated in the result dict. - """ - gt_labels_3d = input_dict['gt_labels_3d'] - gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], - dtype=np.bool_) - input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] - input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] - - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(classes={self.classes})' - return repr_str - - -@PIPELINES.register_module() -class PointSample(object): - """Point sample. - - Sampling data to a certain number. - - Args: - num_points (int): Number of points to be sampled. - sample_range (float, optional): The range where to sample points. - If not None, the points with depth larger than `sample_range` are - prior to be sampled. Defaults to None. - replace (bool, optional): Whether the sampling is with or without - replacement. Defaults to False. - """ - - def __init__(self, num_points, sample_range=None, replace=False): - self.num_points = num_points - self.sample_range = sample_range - self.replace = replace - - def _points_random_sampling(self, - points, - num_samples, - sample_range=None, - replace=False, - return_choices=False): - """Points random sampling. - - Sample points to a certain number. - - Args: - points (np.ndarray | :obj:`BasePoints`): 3D Points. - num_samples (int): Number of samples to be sampled. - sample_range (float, optional): Indicating the range where the - points will be sampled. Defaults to None. - replace (bool, optional): Sampling with or without replacement. - Defaults to None. - return_choices (bool, optional): Whether return choice. - Defaults to False. - Returns: - tuple[np.ndarray] | np.ndarray: - - points (np.ndarray | :obj:`BasePoints`): 3D Points. - - choices (np.ndarray, optional): The generated random samples. 
- """ - if not replace: - replace = (points.shape[0] < num_samples) - point_range = range(len(points)) - if sample_range is not None and not replace: - # Only sampling the near points when len(points) >= num_samples - dist = np.linalg.norm(points.tensor, axis=1) - far_inds = np.where(dist >= sample_range)[0] - near_inds = np.where(dist < sample_range)[0] - # in case there are too many far points - if len(far_inds) > num_samples: - far_inds = np.random.choice( - far_inds, num_samples, replace=False) - point_range = near_inds - num_samples -= len(far_inds) - choices = np.random.choice(point_range, num_samples, replace=replace) - if sample_range is not None and not replace: - choices = np.concatenate((far_inds, choices)) - # Shuffle points after sampling - np.random.shuffle(choices) - if return_choices: - return points[choices], choices - else: - return points[choices] - - def __call__(self, results): - """Call function to sample points to in indoor scenes. - - Args: - input_dict (dict): Result dict from loading pipeline. - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. - """ - points = results['points'] - points, choices = self._points_random_sampling( - points, - self.num_points, - self.sample_range, - self.replace, - return_choices=True) - results['points'] = points - - pts_instance_mask = results.get('pts_instance_mask', None) - pts_semantic_mask = results.get('pts_semantic_mask', None) - - if pts_instance_mask is not None: - pts_instance_mask = pts_instance_mask[choices] - results['pts_instance_mask'] = pts_instance_mask - - if pts_semantic_mask is not None: - pts_semantic_mask = pts_semantic_mask[choices] - results['pts_semantic_mask'] = pts_semantic_mask - - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(num_points={self.num_points},' - repr_str += f' sample_range={self.sample_range},' - repr_str += f' replace={self.replace})' - - return repr_str - - -@PIPELINES.register_module() -class IndoorPointSample(PointSample): - """Indoor point sample. - - Sampling data to a certain number. - NOTE: IndoorPointSample is deprecated in favor of PointSample - - Args: - num_points (int): Number of points to be sampled. - """ - - def __init__(self, *args, **kwargs): - warnings.warn( - 'IndoorPointSample is deprecated in favor of PointSample') - super(IndoorPointSample, self).__init__(*args, **kwargs) - - -@PIPELINES.register_module() -class IndoorPatchPointSample(object): - r"""Indoor point sample within a patch. Modified from `PointNet++ `_. - - Sampling data to a certain number for semantic segmentation. - - Args: - num_points (int): Number of points to be sampled. - block_size (float, optional): Size of a block to sample points from. - Defaults to 1.5. - sample_rate (float, optional): Stride used in sliding patch generation. - This parameter is unused in `IndoorPatchPointSample` and thus has - been deprecated. We plan to remove it in the future. - Defaults to None. - ignore_index (int, optional): Label index that won't be used for the - segmentation task. This is set in PointSegClassMapping as neg_cls. - If not None, will be used as a patch selection criterion. - Defaults to None. - use_normalized_coord (bool, optional): Whether to use normalized xyz as - additional features. Defaults to False. - num_try (int, optional): Number of times to try if the patch selected - is invalid. Defaults to 10. 
- enlarge_size (float, optional): Enlarge the sampled patch to - [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as - an augmentation. If None, set it as 0. Defaults to 0.2. - min_unique_num (int, optional): Minimum number of unique points - the sampled patch should contain. If None, use PointNet++'s method - to judge uniqueness. Defaults to None. - eps (float, optional): A value added to patch boundary to guarantee - points coverage. Defaults to 1e-2. - - Note: - This transform should only be used in the training process of point - cloud segmentation tasks. For the sliding patch generation and - inference process in testing, please refer to the `slide_inference` - function of `EncoderDecoder3D` class. - """ - - def __init__(self, - num_points, - block_size=1.5, - sample_rate=None, - ignore_index=None, - use_normalized_coord=False, - num_try=10, - enlarge_size=0.2, - min_unique_num=None, - eps=1e-2): - self.num_points = num_points - self.block_size = block_size - self.ignore_index = ignore_index - self.use_normalized_coord = use_normalized_coord - self.num_try = num_try - self.enlarge_size = enlarge_size if enlarge_size is not None else 0.0 - self.min_unique_num = min_unique_num - self.eps = eps - - if sample_rate is not None: - warnings.warn( - "'sample_rate' has been deprecated and will be removed in " - 'the future. Please remove them from your code.') - - def _input_generation(self, coords, patch_center, coord_max, attributes, - attribute_dims, point_type): - """Generating model input. - - Generate input by subtracting patch center and adding additional - features. Currently support colors and normalized xyz as features. - - Args: - coords (np.ndarray): Sampled 3D Points. - patch_center (np.ndarray): Center coordinate of the selected patch. - coord_max (np.ndarray): Max coordinate of all 3D Points. - attributes (np.ndarray): features of input points. - attribute_dims (dict): Dictionary to indicate the meaning of extra - dimension. - point_type (type): class of input points inherited from BasePoints. - - Returns: - :obj:`BasePoints`: The generated input data. - """ - # subtract patch center, the z dimension is not centered - centered_coords = coords.copy() - centered_coords[:, 0] -= patch_center[0] - centered_coords[:, 1] -= patch_center[1] - - if self.use_normalized_coord: - normalized_coord = coords / coord_max - attributes = np.concatenate([attributes, normalized_coord], axis=1) - if attribute_dims is None: - attribute_dims = dict() - attribute_dims.update( - dict(normalized_coord=[ - attributes.shape[1], attributes.shape[1] + - 1, attributes.shape[1] + 2 - ])) - - points = np.concatenate([centered_coords, attributes], axis=1) - points = point_type( - points, points_dim=points.shape[1], attribute_dims=attribute_dims) - - return points - - def _patch_points_sampling(self, points, sem_mask): - """Patch points sampling. - - First sample a valid patch. - Then sample points within that patch to a certain number. - - Args: - points (:obj:`BasePoints`): 3D Points. - sem_mask (np.ndarray): semantic segmentation mask for input points. - - Returns: - tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`: - - - points (:obj:`BasePoints`): 3D Points. - - choices (np.ndarray): The generated random samples. 
- """ - coords = points.coord.numpy() - attributes = points.tensor[:, 3:].numpy() - attribute_dims = points.attribute_dims - point_type = type(points) - - coord_max = np.amax(coords, axis=0) - coord_min = np.amin(coords, axis=0) - - for _ in range(self.num_try): - # random sample a point as patch center - cur_center = coords[np.random.choice(coords.shape[0])] - - # boundary of a patch, which would be enlarged by - # `self.enlarge_size` as an augmentation - cur_max = cur_center + np.array( - [self.block_size / 2.0, self.block_size / 2.0, 0.0]) - cur_min = cur_center - np.array( - [self.block_size / 2.0, self.block_size / 2.0, 0.0]) - cur_max[2] = coord_max[2] - cur_min[2] = coord_min[2] - cur_choice = np.sum( - (coords >= (cur_min - self.enlarge_size)) * - (coords <= (cur_max + self.enlarge_size)), - axis=1) == 3 - - if not cur_choice.any(): # no points in this patch - continue - - cur_coords = coords[cur_choice, :] - cur_sem_mask = sem_mask[cur_choice] - point_idxs = np.where(cur_choice)[0] - mask = np.sum( - (cur_coords >= (cur_min - self.eps)) * (cur_coords <= - (cur_max + self.eps)), - axis=1) == 3 - - # two criteria for patch sampling, adopted from PointNet++ - # 1. selected patch should contain enough unique points - if self.min_unique_num is None: - # use PointNet++'s method as default - # [31, 31, 62] are just some big values used to transform - # coords from 3d array to 1d and then check their uniqueness - # this is used in all the ScanNet code following PointNet++ - vidx = np.ceil( - (cur_coords[mask, :] - cur_min) / (cur_max - cur_min) * - np.array([31.0, 31.0, 62.0])) - vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 + - vidx[:, 2]) - flag1 = len(vidx) / 31.0 / 31.0 / 62.0 >= 0.02 - else: - # if `min_unique_num` is provided, directly compare with it - flag1 = mask.sum() >= self.min_unique_num - - # 2. selected patch should contain enough annotated points - if self.ignore_index is None: - flag2 = True - else: - flag2 = np.sum(cur_sem_mask != self.ignore_index) / \ - len(cur_sem_mask) >= 0.7 - - if flag1 and flag2: - break - - # sample idx to `self.num_points` - if point_idxs.size >= self.num_points: - # no duplicate in sub-sampling - choices = np.random.choice( - point_idxs, self.num_points, replace=False) - else: - # do not use random choice here to avoid some points not counted - dup = np.random.choice(point_idxs.size, - self.num_points - point_idxs.size) - idx_dup = np.concatenate( - [np.arange(point_idxs.size), - np.array(dup)], 0) - choices = point_idxs[idx_dup] - - # construct model input - points = self._input_generation(coords[choices], cur_center, coord_max, - attributes[choices], attribute_dims, - point_type) - - return points, choices - - def __call__(self, results): - """Call function to sample points to in indoor scenes. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. 
- """ - points = results['points'] - - assert 'pts_semantic_mask' in results.keys(), \ - 'semantic mask should be provided in training and evaluation' - pts_semantic_mask = results['pts_semantic_mask'] - - points, choices = self._patch_points_sampling(points, - pts_semantic_mask) - - results['points'] = points - results['pts_semantic_mask'] = pts_semantic_mask[choices] - pts_instance_mask = results.get('pts_instance_mask', None) - if pts_instance_mask is not None: - results['pts_instance_mask'] = pts_instance_mask[choices] - - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(num_points={self.num_points},' - repr_str += f' block_size={self.block_size},' - repr_str += f' ignore_index={self.ignore_index},' - repr_str += f' use_normalized_coord={self.use_normalized_coord},' - repr_str += f' num_try={self.num_try},' - repr_str += f' enlarge_size={self.enlarge_size},' - repr_str += f' min_unique_num={self.min_unique_num},' - repr_str += f' eps={self.eps})' - return repr_str - - -@PIPELINES.register_module() -class BackgroundPointsFilter(object): - """Filter background points near the bounding box. - - Args: - bbox_enlarge_range (tuple[float], float): Bbox enlarge range. - """ - - def __init__(self, bbox_enlarge_range): - assert (is_tuple_of(bbox_enlarge_range, float) - and len(bbox_enlarge_range) == 3) \ - or isinstance(bbox_enlarge_range, float), \ - f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}' - - if isinstance(bbox_enlarge_range, float): - bbox_enlarge_range = [bbox_enlarge_range] * 3 - self.bbox_enlarge_range = np.array( - bbox_enlarge_range, dtype=np.float32)[np.newaxis, :] - - def __call__(self, input_dict): - """Call function to filter points by the range. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. - """ - points = input_dict['points'] - gt_bboxes_3d = input_dict['gt_bboxes_3d'] - - # avoid groundtruth being modified - gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy() - gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.clone().numpy() - - enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy() - enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range - points_numpy = points.tensor.clone().numpy() - foreground_masks = box_np_ops.points_in_rbbox( - points_numpy, gt_bboxes_3d_np, origin=(0.5, 0.5, 0.5)) - enlarge_foreground_masks = box_np_ops.points_in_rbbox( - points_numpy, enlarged_gt_bboxes_3d, origin=(0.5, 0.5, 0.5)) - foreground_masks = foreground_masks.max(1) - enlarge_foreground_masks = enlarge_foreground_masks.max(1) - valid_masks = ~np.logical_and(~foreground_masks, - enlarge_foreground_masks) - - input_dict['points'] = points[valid_masks] - pts_instance_mask = input_dict.get('pts_instance_mask', None) - if pts_instance_mask is not None: - input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks] - - pts_semantic_mask = input_dict.get('pts_semantic_mask', None) - if pts_semantic_mask is not None: - input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks] - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})' - return repr_str - - -@PIPELINES.register_module() -class VoxelBasedPointSampler(object): - """Voxel based point sampler. 
- - Apply voxel sampling to multiple sweep points. - - Args: - cur_sweep_cfg (dict): Config for sampling current points. - prev_sweep_cfg (dict): Config for sampling previous points. - time_dim (int): Index that indicate the time dimension - for input points. - """ - - def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3): - self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg) - self.cur_voxel_num = self.cur_voxel_generator._max_voxels - self.time_dim = time_dim - if prev_sweep_cfg is not None: - assert prev_sweep_cfg['max_num_points'] == \ - cur_sweep_cfg['max_num_points'] - self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg) - self.prev_voxel_num = self.prev_voxel_generator._max_voxels - else: - self.prev_voxel_generator = None - self.prev_voxel_num = 0 - - def _sample_points(self, points, sampler, point_dim): - """Sample points for each points subset. - - Args: - points (np.ndarray): Points subset to be sampled. - sampler (VoxelGenerator): Voxel based sampler for - each points subset. - point_dim (int): The dimension of each points - - Returns: - np.ndarray: Sampled points. - """ - voxels, coors, num_points_per_voxel = sampler.generate(points) - if voxels.shape[0] < sampler._max_voxels: - padding_points = np.zeros([ - sampler._max_voxels - voxels.shape[0], sampler._max_num_points, - point_dim - ], - dtype=points.dtype) - padding_points[:] = voxels[0] - sample_points = np.concatenate([voxels, padding_points], axis=0) - else: - sample_points = voxels - - return sample_points - - def __call__(self, results): - """Call function to sample points from multiple sweeps. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. - """ - points = results['points'] - original_dim = points.shape[1] - - # TODO: process instance and semantic mask while _max_num_points - # is larger than 1 - # Extend points with seg and mask fields - map_fields2dim = [] - start_dim = original_dim - points_numpy = points.tensor.numpy() - extra_channel = [points_numpy] - for idx, key in enumerate(results['pts_mask_fields']): - map_fields2dim.append((key, idx + start_dim)) - extra_channel.append(results[key][..., None]) - - start_dim += len(results['pts_mask_fields']) - for idx, key in enumerate(results['pts_seg_fields']): - map_fields2dim.append((key, idx + start_dim)) - extra_channel.append(results[key][..., None]) - - points_numpy = np.concatenate(extra_channel, axis=-1) - - # Split points into two part, current sweep points and - # previous sweeps points. - # TODO: support different sampling methods for next sweeps points - # and previous sweeps points. 
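# The split below keys on the time channel: rows with time == 0 form the
# current sweep, all other rows the previous sweeps, so each group can be
# voxel-sampled with its own budget before being re-concatenated.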
- cur_points_flag = (points_numpy[:, self.time_dim] == 0) - cur_sweep_points = points_numpy[cur_points_flag] - prev_sweeps_points = points_numpy[~cur_points_flag] - if prev_sweeps_points.shape[0] == 0: - prev_sweeps_points = cur_sweep_points - - # Shuffle points before sampling - np.random.shuffle(cur_sweep_points) - np.random.shuffle(prev_sweeps_points) - - cur_sweep_points = self._sample_points(cur_sweep_points, - self.cur_voxel_generator, - points_numpy.shape[1]) - if self.prev_voxel_generator is not None: - prev_sweeps_points = self._sample_points(prev_sweeps_points, - self.prev_voxel_generator, - points_numpy.shape[1]) - - points_numpy = np.concatenate( - [cur_sweep_points, prev_sweeps_points], 0) - else: - points_numpy = cur_sweep_points - - if self.cur_voxel_generator._max_num_points == 1: - points_numpy = points_numpy.squeeze(1) - results['points'] = points.new_point(points_numpy[..., :original_dim]) - - # Restore the corresponding seg and mask fields - for key, dim_index in map_fields2dim: - results[key] = points_numpy[..., dim_index] - - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - - def _auto_indent(repr_str, indent): - repr_str = repr_str.split('\n') - repr_str = [' ' * indent + t + '\n' for t in repr_str] - repr_str = ''.join(repr_str)[:-1] - return repr_str - - repr_str = self.__class__.__name__ - indent = 4 - repr_str += '(\n' - repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n' - repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n' - repr_str += ' ' * indent + f'time_dim={self.time_dim},\n' - repr_str += ' ' * indent + 'cur_voxel_generator=\n' - repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n' - repr_str += ' ' * indent + 'prev_voxel_generator=\n' - repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})' - return repr_str - - -@PIPELINES.register_module() -class AffineResize(object): - """Get the affine transform matrices to the target size. - - Different from :class:`RandomAffine` in MMDetection, this class can - calculate the affine transform matrices while resizing the input image - to a fixed size. The affine transform matrices include: 1) matrix - transforming original image to the network input image size. 2) matrix - transforming original image to the network output feature map size. - - Args: - img_scale (tuple): Images scales for resizing. - down_ratio (int): The down ratio of feature map. - Actually the arg should be >= 1. - bbox_clip_border (bool, optional): Whether clip the objects - outside the border of the image. Defaults to True. - """ - - def __init__(self, img_scale, down_ratio, bbox_clip_border=True): - - self.img_scale = img_scale - self.down_ratio = down_ratio - self.bbox_clip_border = bbox_clip_border - - def __call__(self, results): - """Call function to do affine transform to input image and labels. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Results after affine resize, 'affine_aug', 'trans_mat' - keys are added in the result dict. 
- """ - # The results have gone through RandomShiftScale before AffineResize - if 'center' not in results: - img = results['img'] - height, width = img.shape[:2] - center = np.array([width / 2, height / 2], dtype=np.float32) - size = np.array([width, height], dtype=np.float32) - results['affine_aug'] = False - else: - # The results did not go through RandomShiftScale before - # AffineResize - img = results['img'] - center = results['center'] - size = results['size'] - - trans_affine = self._get_transform_matrix(center, size, self.img_scale) - - img = cv2.warpAffine(img, trans_affine[:2, :], self.img_scale) - - if isinstance(self.down_ratio, tuple): - trans_mat = [ - self._get_transform_matrix( - center, size, - (self.img_scale[0] // ratio, self.img_scale[1] // ratio)) - for ratio in self.down_ratio - ] # (3, 3) - else: - trans_mat = self._get_transform_matrix( - center, size, (self.img_scale[0] // self.down_ratio, - self.img_scale[1] // self.down_ratio)) - - results['img'] = img - results['img_shape'] = img.shape - results['pad_shape'] = img.shape - results['trans_mat'] = trans_mat - - self._affine_bboxes(results, trans_affine) - - if 'centers2d' in results: - centers2d = self._affine_transform(results['centers2d'], - trans_affine) - valid_index = (centers2d[:, 0] > - 0) & (centers2d[:, 0] < - self.img_scale[0]) & (centers2d[:, 1] > 0) & ( - centers2d[:, 1] < self.img_scale[1]) - results['centers2d'] = centers2d[valid_index] - - for key in results.get('bbox_fields', []): - if key in ['gt_bboxes']: - results[key] = results[key][valid_index] - if 'gt_labels' in results: - results['gt_labels'] = results['gt_labels'][ - valid_index] - if 'gt_masks' in results: - raise NotImplementedError( - 'AffineResize only supports bbox.') - - for key in results.get('bbox3d_fields', []): - if key in ['gt_bboxes_3d']: - results[key].tensor = results[key].tensor[valid_index] - if 'gt_labels_3d' in results: - results['gt_labels_3d'] = results['gt_labels_3d'][ - valid_index] - - results['depths'] = results['depths'][valid_index] - - return results - - def _affine_bboxes(self, results, matrix): - """Affine transform bboxes to input image. - - Args: - results (dict): Result dict from loading pipeline. - matrix (np.ndarray): Matrix transforming original - image to the network input image size. - shape: (3, 3) - """ - - for key in results.get('bbox_fields', []): - bboxes = results[key] - bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix) - bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix) - if self.bbox_clip_border: - bboxes[:, - [0, 2]] = bboxes[:, - [0, 2]].clip(0, self.img_scale[0] - 1) - bboxes[:, - [1, 3]] = bboxes[:, - [1, 3]].clip(0, self.img_scale[1] - 1) - results[key] = bboxes - - def _affine_transform(self, points, matrix): - """Affine transform bbox points to input image. - - Args: - points (np.ndarray): Points to be transformed. - shape: (N, 2) - matrix (np.ndarray): Affine transform matrix. - shape: (3, 3) - - Returns: - np.ndarray: Transformed points. - """ - num_points = points.shape[0] - hom_points_2d = np.concatenate((points, np.ones((num_points, 1))), - axis=1) - hom_points_2d = hom_points_2d.T - affined_points = np.matmul(matrix, hom_points_2d).T - return affined_points[:, :2] - - def _get_transform_matrix(self, center, scale, output_scale): - """Get affine transform matrix. - - Args: - center (tuple): Center of current image. - scale (tuple): Scale of current image. - output_scale (tuple[float]): The transform target image scales. 
- - Returns: - np.ndarray: Affine transform matrix. - """ - # TODO: further add rot and shift here. - src_w = scale[0] - dst_w = output_scale[0] - dst_h = output_scale[1] - - src_dir = np.array([0, src_w * -0.5]) - dst_dir = np.array([0, dst_w * -0.5]) - - src = np.zeros((3, 2), dtype=np.float32) - dst = np.zeros((3, 2), dtype=np.float32) - src[0, :] = center - src[1, :] = center + src_dir - dst[0, :] = np.array([dst_w * 0.5, dst_h * 0.5]) - dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir - - src[2, :] = self._get_ref_point(src[0, :], src[1, :]) - dst[2, :] = self._get_ref_point(dst[0, :], dst[1, :]) - - get_matrix = cv2.getAffineTransform(src, dst) - - matrix = np.concatenate((get_matrix, [[0., 0., 1.]])) - - return matrix.astype(np.float32) - - def _get_ref_point(self, ref_point1, ref_point2): - """Get reference point to calculate affine transform matrix. - - While using opencv to calculate the affine matrix, we need at least - three corresponding points separately on original image and target - image. Here we use two points to get the the third reference point. - """ - d = ref_point1 - ref_point2 - ref_point3 = ref_point2 + np.array([-d[1], d[0]]) - return ref_point3 - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(img_scale={self.img_scale}, ' - repr_str += f'down_ratio={self.down_ratio}) ' - return repr_str - - -@PIPELINES.register_module() -class RandomShiftScale(object): - """Random shift scale. - - Different from the normal shift and scale function, it doesn't - directly shift or scale image. It can record the shift and scale - infos into loading pipelines. It's designed to be used with - AffineResize together. - - Args: - shift_scale (tuple[float]): Shift and scale range. - aug_prob (float): The shifting and scaling probability. - """ - - def __init__(self, shift_scale, aug_prob): - - self.shift_scale = shift_scale - self.aug_prob = aug_prob - - def __call__(self, results): - """Call function to record random shift and scale infos. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Results after random shift and scale, 'center', 'size' - and 'affine_aug' keys are added in the result dict. - """ - img = results['img'] - - height, width = img.shape[:2] - - center = np.array([width / 2, height / 2], dtype=np.float32) - size = np.array([width, height], dtype=np.float32) - - if random.random() < self.aug_prob: - shift, scale = self.shift_scale[0], self.shift_scale[1] - shift_ranges = np.arange(-shift, shift + 0.1, 0.1) - center[0] += size[0] * random.choice(shift_ranges) - center[1] += size[1] * random.choice(shift_ranges) - scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1) - size *= random.choice(scale_ranges) - results['affine_aug'] = True - else: - results['affine_aug'] = False - - results['center'] = center - results['size'] = size - - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(shift_scale={self.shift_scale}, ' - repr_str += f'aug_prob={self.aug_prob}) ' - return repr_str +# Copyright (c) OpenMMLab. All rights reserved. 
+import random +import warnings + +import cv2 +import numpy as np +from mmcv import is_tuple_of +from mmcv.utils import build_from_cfg + +from mmdet3d.core import VoxelGenerator +from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, + LiDARInstance3DBoxes, box_np_ops) +from mmdet3d.datasets.pipelines.compose import Compose +from mmdet.datasets.pipelines import RandomCrop, RandomFlip, Rotate +from ..builder import OBJECTSAMPLERS, PIPELINES +from .data_augment_utils import noise_per_object_v3_ + + +@PIPELINES.register_module() +class RandomDropPointsColor(object): + r"""Randomly set the color of points to all zeros. + + Once this transform is executed, all the points' color will be dropped. + Refer to `PAConv `_ for more details. + + Args: + drop_ratio (float, optional): The probability of dropping point colors. + Defaults to 0.2. + """ + + def __init__(self, drop_ratio=0.2): + assert isinstance(drop_ratio, (int, float)) and 0 <= drop_ratio <= 1, \ + f'invalid drop_ratio value {drop_ratio}' + self.drop_ratio = drop_ratio + + def __call__(self, input_dict): + """Call function to drop point colors. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after color dropping, + 'points' key is updated in the result dict. + """ + points = input_dict['points'] + assert points.attribute_dims is not None and \ + 'color' in points.attribute_dims, \ + 'Expect points have color attribute' + + # this if-expression is a bit strange + # `RandomDropPointsColor` is used in training 3D segmentor PAConv + # we discovered in our experiments that, using + # `if np.random.rand() > 1.0 - self.drop_ratio` consistently leads to + # better results than using `if np.random.rand() < self.drop_ratio` + # so we keep this hack in our codebase + if np.random.rand() > 1.0 - self.drop_ratio: + points.color = points.color * 0.0 + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(drop_ratio={self.drop_ratio})' + return repr_str + + +@PIPELINES.register_module() +class RandomFlip3D(RandomFlip): + """Flip the points & bbox. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + Args: + sync_2d (bool, optional): Whether to apply flip according to the 2D + images. If True, it will apply the same flip as that to 2D images. + If False, it will decide whether to flip randomly and independently + to that of 2D images. Defaults to True. + flip_ratio_bev_horizontal (float, optional): The flipping probability + in horizontal direction. Defaults to 0.0. + flip_ratio_bev_vertical (float, optional): The flipping probability + in vertical direction. Defaults to 0.0. + """ + + def __init__(self, + sync_2d=True, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0, + **kwargs): + super(RandomFlip3D, self).__init__( + flip_ratio=flip_ratio_bev_horizontal, **kwargs) + self.sync_2d = sync_2d + self.flip_ratio_bev_vertical = flip_ratio_bev_vertical + if flip_ratio_bev_horizontal is not None: + assert isinstance( + flip_ratio_bev_horizontal, + (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1 + if flip_ratio_bev_vertical is not None: + assert isinstance( + flip_ratio_bev_vertical, + (int, float)) and 0 <= flip_ratio_bev_vertical <= 1 + + def random_flip_data_3d(self, input_dict, direction='horizontal'): + """Flip 3D data randomly. 
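# Minimal sketch of RandomDropPointsColor with its documented default ratio, as
# used in colour-aware segmentation pipelines such as PAConv.
drop_color = dict(type='RandomDropPointsColor', drop_ratio=0.2)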
+ + Args: + input_dict (dict): Result dict from loading pipeline. + direction (str, optional): Flip direction. + Default: 'horizontal'. + + Returns: + dict: Flipped results, 'points', 'bbox3d_fields' keys are + updated in the result dict. + """ + assert direction in ['horizontal', 'vertical'] + # for semantic segmentation task, only points will be flipped. + if 'bbox3d_fields' not in input_dict: + input_dict['points'].flip(direction) + return + if len(input_dict['bbox3d_fields']) == 0: # test mode + input_dict['bbox3d_fields'].append('empty_box3d') + input_dict['empty_box3d'] = input_dict['box_type_3d']( + np.array([], dtype=np.float32)) + assert len(input_dict['bbox3d_fields']) == 1 + for key in input_dict['bbox3d_fields']: + if 'points' in input_dict: + input_dict['points'] = input_dict[key].flip( + direction, points=input_dict['points']) + else: + input_dict[key].flip(direction) + if 'centers2d' in input_dict: + assert self.sync_2d is True and direction == 'horizontal', \ + 'Only support sync_2d=True and horizontal flip with images' + w = input_dict['ori_shape'][1] + input_dict['centers2d'][..., 0] = \ + w - input_dict['centers2d'][..., 0] + # need to modify the horizontal position of camera center + # along u-axis in the image (flip like centers2d) + # ['cam2img'][0][2] = c_u + # see more details and examples at + # https://github.com/open-mmlab/mmdetection3d/pull/744 + input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2] + + def __call__(self, input_dict): + """Call function to flip points, values in the ``bbox3d_fields`` and + also flip 2D image and its annotations. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction', + 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added + into result dict. + """ + # flip 2D image and its annotations + super(RandomFlip3D, self).__call__(input_dict) + + if self.sync_2d: + input_dict['pcd_horizontal_flip'] = input_dict['flip'] + input_dict['pcd_vertical_flip'] = False + else: + if 'pcd_horizontal_flip' not in input_dict: + flip_horizontal = True if np.random.rand( + ) < self.flip_ratio else False + input_dict['pcd_horizontal_flip'] = flip_horizontal + if 'pcd_vertical_flip' not in input_dict: + flip_vertical = True if np.random.rand( + ) < self.flip_ratio_bev_vertical else False + input_dict['pcd_vertical_flip'] = flip_vertical + + if 'transformation_3d_flow' not in input_dict: + input_dict['transformation_3d_flow'] = [] + + if input_dict['pcd_horizontal_flip']: + self.random_flip_data_3d(input_dict, 'horizontal') + input_dict['transformation_3d_flow'].extend(['HF']) + if input_dict['pcd_vertical_flip']: + self.random_flip_data_3d(input_dict, 'vertical') + input_dict['transformation_3d_flow'].extend(['VF']) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(sync_2d={self.sync_2d},' + repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})' + return repr_str + + +@PIPELINES.register_module() +class MultiViewWrapper(object): + """Wrap transformation from single-view into multi-view. + + The wrapper processes the images from multi-view one by one. For each + image, it constructs a pseudo dict according to the keys specified by the + 'process_fields' parameter. After the transformation is finished, desired + information can be collected by specifying the keys in the 'collected_keys' + parameter. 
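# Illustrative entry (ratios assumed from common nuScenes-style settings) using
# RandomFlip3D with BEV flips decided independently of the 2D image flip.
flip_3d = dict(
    type='RandomFlip3D',
    sync_2d=False,
    flip_ratio_bev_horizontal=0.5,
    flip_ratio_bev_vertical=0.5)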
Multi-view images share the same transformation parameters + but do not share the same magnitude when a random transformation is + conducted. + + Args: + transforms (list[dict]): A list of dict specifying the transformations + for the monocular situation. + process_fields (dict): Desired keys that the transformations should + be conducted on. Default to dict(img_fields=['img']). + collected_keys (list[str]): Collect information in transformation + like rotate angles, crop roi, and flip state. + """ + + def __init__(self, + transforms, + process_fields=dict(img_fields=['img']), + collected_keys=[]): + self.transform = Compose(transforms) + self.collected_keys = collected_keys + self.process_fields = process_fields + + def __call__(self, input_dict): + for key in self.collected_keys: + input_dict[key] = [] + for img_id in range(len(input_dict['img'])): + process_dict = self.process_fields.copy() + for field in self.process_fields: + for key in self.process_fields[field]: + process_dict[key] = input_dict[key][img_id] + process_dict = self.transform(process_dict) + for field in self.process_fields: + for key in self.process_fields[field]: + input_dict[key][img_id] = process_dict[key] + for key in self.collected_keys: + input_dict[key].append(process_dict[key]) + return input_dict + + +@PIPELINES.register_module() +class RangeLimitedRandomCrop(RandomCrop): + """Randomly crop image-view objects under a limitation of range. + + Args: + relative_x_offset_range (tuple[float]): Relative range of random crop + in x direction. (x_min, x_max) in [0, 1.0]. Default to (0.0, 1.0). + relative_y_offset_range (tuple[float]): Relative range of random crop + in y direction. (y_min, y_max) in [0, 1.0]. Default to (0.0, 1.0). + """ + + def __init__(self, + relative_x_offset_range=(0.0, 1.0), + relative_y_offset_range=(0.0, 1.0), + **kwargs): + super(RangeLimitedRandomCrop, self).__init__(**kwargs) + for range in [relative_x_offset_range, relative_y_offset_range]: + assert 0 <= range[0] <= range[1] <= 1 + self.relative_x_offset_range = relative_x_offset_range + self.relative_y_offset_range = relative_y_offset_range + + def _crop_data(self, results, crop_size, allow_negative_crop): + """Function to randomly crop images. + + Modified from RandomCrop in mmdet==2.25.0 + + Args: + results (dict): Result dict from loading pipeline. + crop_size (tuple): Expected absolute size after cropping, (h, w). + + Returns: + dict: Randomly cropped results, 'img_shape' key in result dict is + updated according to crop size. + """ + assert crop_size[0] > 0 and crop_size[1] > 0 + for key in results.get('img_fields', ['img']): + img = results[key] + margin_h = max(img.shape[0] - crop_size[0], 0) + margin_w = max(img.shape[1] - crop_size[1], 0) + offset_range_h = (margin_h * self.relative_y_offset_range[0], + margin_h * self.relative_y_offset_range[1] + 1) + offset_h = np.random.randint(*offset_range_h) + offset_range_w = (margin_w * self.relative_x_offset_range[0], + margin_w * self.relative_x_offset_range[1] + 1) + offset_w = np.random.randint(*offset_range_w) + crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] + crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] + + # crop the image + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] + img_shape = img.shape + results[key] = img + results['crop'] = (crop_x1, crop_y1, crop_x2, crop_y2) + results['img_shape'] = img_shape + + # crop bboxes accordingly and clip to the image boundary + for key in results.get('bbox_fields', []): + # e.g. 
gt_bboxes and gt_bboxes_ignore + bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], + dtype=np.float32) + bboxes = results[key] - bbox_offset + if self.bbox_clip_border: + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( + bboxes[:, 3] > bboxes[:, 1]) + # If the crop does not contain any gt-bbox area and + # allow_negative_crop is False, skip this image. + if (key == 'gt_bboxes' and not valid_inds.any() + and not allow_negative_crop): + return None + results[key] = bboxes[valid_inds, :] + # label fields. e.g. gt_labels and gt_labels_ignore + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][valid_inds] + + # mask fields, e.g. gt_masks and gt_masks_ignore + mask_key = self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + if self.recompute_bbox: + results[key] = results[mask_key].get_bboxes() + + # crop semantic seg + for key in results.get('seg_fields', []): + results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] + + return results + + +@PIPELINES.register_module() +class RandomRotate(Rotate): + """Randomly rotate images. + + The ratation angle is selected uniformly within the interval specified by + the 'range' parameter. + + Args: + range (tuple[float]): Define the range of random rotation. + (angle_min, angle_max) in angle. + """ + + def __init__(self, range, **kwargs): + super(RandomRotate, self).__init__(**kwargs) + self.range = range + + def __call__(self, results): + self.angle = np.random.uniform(self.range[0], self.range[1]) + super(RandomRotate, self).__call__(results) + results['rotate'] = self.angle + return results + + +@PIPELINES.register_module() +class RandomJitterPoints(object): + """Randomly jitter point coordinates. + + Different from the global translation in ``GlobalRotScaleTrans``, here we + apply different noises to each point in a scene. + + Args: + jitter_std (list[float]): The standard deviation of jittering noise. + This applies random noise to all points in a 3D scene, which is + sampled from a gaussian distribution whose standard deviation is + set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01] + clip_range (list[float]): Clip the randomly generated jitter + noise into this range. If None is given, don't perform clipping. + Defaults to [-0.05, 0.05] + + Note: + This transform should only be used in point cloud segmentation tasks + because we don't transform ground-truth bboxes accordingly. + For similar transform in detection task, please refer to `ObjectNoise`. + """ + + def __init__(self, + jitter_std=[0.01, 0.01, 0.01], + clip_range=[-0.05, 0.05]): + seq_types = (list, tuple, np.ndarray) + if not isinstance(jitter_std, seq_types): + assert isinstance(jitter_std, (int, float)), \ + f'unsupported jitter_std type {type(jitter_std)}' + jitter_std = [jitter_std, jitter_std, jitter_std] + self.jitter_std = jitter_std + + if clip_range is not None: + if not isinstance(clip_range, seq_types): + assert isinstance(clip_range, (int, float)), \ + f'unsupported clip_range type {type(clip_range)}' + clip_range = [-clip_range, clip_range] + self.clip_range = clip_range + + def __call__(self, input_dict): + """Call function to jitter all the points in the scene. + + Args: + input_dict (dict): Result dict from loading pipeline. 
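# Minimal sketch of RandomJitterPoints using the documented defaults; intended
# for segmentation pipelines only, since bounding boxes are not jittered.
jitter = dict(
    type='RandomJitterPoints',
    jitter_std=[0.01, 0.01, 0.01],
    clip_range=[-0.05, 0.05])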
+ + Returns: + dict: Results after adding noise to each point, + 'points' key is updated in the result dict. + """ + points = input_dict['points'] + jitter_std = np.array(self.jitter_std, dtype=np.float32) + jitter_noise = \ + np.random.randn(points.shape[0], 3) * jitter_std[None, :] + if self.clip_range is not None: + jitter_noise = np.clip(jitter_noise, self.clip_range[0], + self.clip_range[1]) + + points.translate(jitter_noise) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(jitter_std={self.jitter_std},' + repr_str += f' clip_range={self.clip_range})' + return repr_str + + +@PIPELINES.register_module() +class ObjectSample(object): + """Sample GT objects to the data. + + Args: + db_sampler (dict): Config dict of the database sampler. + sample_2d (bool): Whether to also paste 2D image patch to the images + This should be true when applying multi-modality cut-and-paste. + Defaults to False. + use_ground_plane (bool): Whether to use gound plane to adjust the + 3D labels. + """ + + def __init__(self, db_sampler, sample_2d=False, use_ground_plane=False): + self.sampler_cfg = db_sampler + self.sample_2d = sample_2d + if 'type' not in db_sampler.keys(): + db_sampler['type'] = 'DataBaseSampler' + self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) + self.use_ground_plane = use_ground_plane + + @staticmethod + def remove_points_in_boxes(points, boxes): + """Remove the points in the sampled bounding boxes. + + Args: + points (:obj:`BasePoints`): Input point cloud array. + boxes (np.ndarray): Sampled ground truth boxes. + + Returns: + np.ndarray: Points with those in the boxes removed. + """ + masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes) + points = points[np.logical_not(masks.any(-1))] + return points + + def __call__(self, input_dict): + """Call function to sample ground truth objects to the data. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after object sampling augmentation, + 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated + in the result dict. 
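# Illustrative ObjectSample configuration (paths, filters and sample groups are
# assumed KITTI-style values) showing the db_sampler dict that __init__ passes
# to build_from_cfg; 'type' falls back to 'DataBaseSampler' when omitted.
db_sampler = dict(
    data_root='data/kitti/',
    info_path='data/kitti/kitti_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(Car=5)),
    classes=['Car'],
    sample_groups=dict(Car=15))
object_sample = dict(type='ObjectSample', db_sampler=db_sampler)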
+ """ + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + + if self.use_ground_plane and 'plane' in input_dict['ann_info']: + ground_plane = input_dict['ann_info']['plane'] + input_dict['plane'] = ground_plane + else: + ground_plane = None + # change to float for blending operation + points = input_dict['points'] + if self.sample_2d: + img = input_dict['img'] + gt_bboxes_2d = input_dict['gt_bboxes'] + # Assume for now 3D & 2D bboxes are the same + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d.tensor.numpy(), + gt_labels_3d, + gt_bboxes_2d=gt_bboxes_2d, + img=img) + else: + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d.tensor.numpy(), + gt_labels_3d, + img=None, + ground_plane=ground_plane) + + if sampled_dict is not None: + sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] + sampled_points = sampled_dict['points'] + sampled_gt_labels = sampled_dict['gt_labels_3d'] + + gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels], + axis=0) + gt_bboxes_3d = gt_bboxes_3d.new_box( + np.concatenate( + [gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d])) + + points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) + # check the points dimension + points = points.cat([sampled_points, points]) + + if self.sample_2d: + sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d'] + gt_bboxes_2d = np.concatenate( + [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) + + input_dict['gt_bboxes'] = gt_bboxes_2d + input_dict['img'] = sampled_dict['img'] + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.int64) + input_dict['points'] = points + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f' sample_2d={self.sample_2d},' + repr_str += f' data_root={self.sampler_cfg.data_root},' + repr_str += f' info_path={self.sampler_cfg.info_path},' + repr_str += f' rate={self.sampler_cfg.rate},' + repr_str += f' prepare={self.sampler_cfg.prepare},' + repr_str += f' classes={self.sampler_cfg.classes},' + repr_str += f' sample_groups={self.sampler_cfg.sample_groups}' + return repr_str + + +@PIPELINES.register_module() +class ObjectNoise(object): + """Apply noise to each GT objects in the scene. + + Args: + translation_std (list[float], optional): Standard deviation of the + distribution where translation noise are sampled from. + Defaults to [0.25, 0.25, 0.25]. + global_rot_range (list[float], optional): Global rotation to the scene. + Defaults to [0.0, 0.0]. + rot_range (list[float], optional): Object rotation range. + Defaults to [-0.15707963267, 0.15707963267]. + num_try (int, optional): Number of times to try if the noise applied is + invalid. Defaults to 100. + """ + + def __init__(self, + translation_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + rot_range=[-0.15707963267, 0.15707963267], + num_try=100): + self.translation_std = translation_std + self.global_rot_range = global_rot_range + self.rot_range = rot_range + self.num_try = num_try + + def __call__(self, input_dict): + """Call function to apply noise to each ground truth in the scene. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after adding noise to each object, + 'points', 'gt_bboxes_3d' keys are updated in the result dict. 
+ """ + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + + # TODO: this is inplace operation + numpy_box = gt_bboxes_3d.tensor.numpy() + numpy_points = points.tensor.numpy() + + noise_per_object_v3_( + numpy_box, + numpy_points, + rotation_perturb=self.rot_range, + center_noise_std=self.translation_std, + global_random_rot_range=self.global_rot_range, + num_try=self.num_try) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box) + input_dict['points'] = points.new_point(numpy_points) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_try={self.num_try},' + repr_str += f' translation_std={self.translation_std},' + repr_str += f' global_rot_range={self.global_rot_range},' + repr_str += f' rot_range={self.rot_range})' + return repr_str + + +@PIPELINES.register_module() +class GlobalAlignment(object): + """Apply global alignment to 3D scene points by rotation and translation. + + Args: + rotation_axis (int): Rotation axis for points and bboxes rotation. + + Note: + We do not record the applied rotation and translation as in + GlobalRotScaleTrans. Because usually, we do not need to reverse + the alignment step. + For example, ScanNet 3D detection task uses aligned ground-truth + bounding boxes for evaluation. + """ + + def __init__(self, rotation_axis): + self.rotation_axis = rotation_axis + + def _trans_points(self, input_dict, trans_factor): + """Private function to translate points. + + Args: + input_dict (dict): Result dict from loading pipeline. + trans_factor (np.ndarray): Translation vector to be applied. + + Returns: + dict: Results after translation, 'points' is updated in the dict. + """ + input_dict['points'].translate(trans_factor) + + def _rot_points(self, input_dict, rot_mat): + """Private function to rotate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + rot_mat (np.ndarray): Rotation matrix to be applied. + + Returns: + dict: Results after rotation, 'points' is updated in the dict. + """ + # input should be rot_mat_T so I transpose it here + input_dict['points'].rotate(rot_mat.T) + + def _check_rot_mat(self, rot_mat): + """Check if rotation matrix is valid for self.rotation_axis. + + Args: + rot_mat (np.ndarray): Rotation matrix to be applied. + """ + is_valid = np.allclose(np.linalg.det(rot_mat), 1.0) + valid_array = np.zeros(3) + valid_array[self.rotation_axis] = 1.0 + is_valid &= (rot_mat[self.rotation_axis, :] == valid_array).all() + is_valid &= (rot_mat[:, self.rotation_axis] == valid_array).all() + assert is_valid, f'invalid rotation matrix {rot_mat}' + + def __call__(self, input_dict): + """Call function to shuffle points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after global alignment, 'points' and keys in + input_dict['bbox3d_fields'] are updated in the result dict. 
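+
+        Example:
+            An illustrative pipeline entry; ``rotation_axis=2`` assumes
+            the alignment rotation is around the z-axis, and the loading
+            pipeline must have stored ``axis_align_matrix`` in
+            ``ann_info``:
+
+            >>> transform_cfg = dict(
+            ...     type='GlobalAlignment', rotation_axis=2)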
+ """ + assert 'axis_align_matrix' in input_dict['ann_info'].keys(), \ + 'axis_align_matrix is not provided in GlobalAlignment' + + axis_align_matrix = input_dict['ann_info']['axis_align_matrix'] + assert axis_align_matrix.shape == (4, 4), \ + f'invalid shape {axis_align_matrix.shape} for axis_align_matrix' + rot_mat = axis_align_matrix[:3, :3] + trans_vec = axis_align_matrix[:3, -1] + + self._check_rot_mat(rot_mat) + self._rot_points(input_dict, rot_mat) + self._trans_points(input_dict, trans_vec) + + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(rotation_axis={self.rotation_axis})' + return repr_str + + +@PIPELINES.register_module() +class GlobalRotScaleTrans(object): + """Apply global rotation, scaling and translation to a 3D scene. + + Args: + rot_range (list[float], optional): Range of rotation angle. + Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). + scale_ratio_range (list[float], optional): Range of scale ratio. + Defaults to [0.95, 1.05]. + translation_std (list[float], optional): The standard deviation of + translation noise applied to a scene, which + is sampled from a gaussian distribution whose standard deviation + is set by ``translation_std``. Defaults to [0, 0, 0] + shift_height (bool, optional): Whether to shift height. + (the fourth dimension of indoor points) when scaling. + Defaults to False. + """ + + def __init__(self, + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0], + shift_height=False): + seq_types = (list, tuple, np.ndarray) + if not isinstance(rot_range, seq_types): + assert isinstance(rot_range, (int, float)), \ + f'unsupported rot_range type {type(rot_range)}' + rot_range = [-rot_range, rot_range] + self.rot_range = rot_range + + assert isinstance(scale_ratio_range, seq_types), \ + f'unsupported scale_ratio_range type {type(scale_ratio_range)}' + self.scale_ratio_range = scale_ratio_range + + if not isinstance(translation_std, seq_types): + assert isinstance(translation_std, (int, float)), \ + f'unsupported translation_std type {type(translation_std)}' + translation_std = [ + translation_std, translation_std, translation_std + ] + assert all([std >= 0 for std in translation_std]), \ + 'translation_std should be positive' + self.translation_std = translation_std + self.shift_height = shift_height + + def _trans_bbox_points(self, input_dict): + """Private function to translate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after translation, 'points', 'pcd_trans' + and keys in input_dict['bbox3d_fields'] are updated + in the result dict. + """ + translation_std = np.array(self.translation_std, dtype=np.float32) + trans_factor = np.random.normal(scale=translation_std, size=3).T + + input_dict['points'].translate(trans_factor) + input_dict['pcd_trans'] = trans_factor + for key in input_dict['bbox3d_fields']: + input_dict[key].translate(trans_factor) + + def _rot_bbox_points(self, input_dict): + """Private function to rotate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after rotation, 'points', 'pcd_rotation' + and keys in input_dict['bbox3d_fields'] are updated + in the result dict. 
+ """ + rotation = self.rot_range + noise_rotation = np.random.uniform(rotation[0], rotation[1]) + + # if no bbox in input_dict, only rotate points + if len(input_dict['bbox3d_fields']) == 0: + rot_mat_T = input_dict['points'].rotate(noise_rotation) + input_dict['pcd_rotation'] = rot_mat_T + input_dict['pcd_rotation_angle'] = noise_rotation + return + + # rotate points with bboxes + for key in input_dict['bbox3d_fields']: + if len(input_dict[key].tensor) != 0: + points, rot_mat_T = input_dict[key].rotate( + noise_rotation, input_dict['points']) + input_dict['points'] = points + input_dict['pcd_rotation'] = rot_mat_T + input_dict['pcd_rotation_angle'] = noise_rotation + + def _scale_bbox_points(self, input_dict): + """Private function to scale bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'points'and keys in + input_dict['bbox3d_fields'] are updated in the result dict. + """ + scale = input_dict['pcd_scale_factor'] + points = input_dict['points'] + points.scale(scale) + if self.shift_height: + assert 'height' in points.attribute_dims.keys(), \ + 'setting shift_height=True but points have no height attribute' + points.tensor[:, points.attribute_dims['height']] *= scale + input_dict['points'] = points + + for key in input_dict['bbox3d_fields']: + input_dict[key].scale(scale) + + def _random_scale(self, input_dict): + """Private function to randomly set the scale factor. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'pcd_scale_factor' are updated + in the result dict. + """ + scale_factor = np.random.uniform(self.scale_ratio_range[0], + self.scale_ratio_range[1]) + input_dict['pcd_scale_factor'] = scale_factor + + def __call__(self, input_dict): + """Private function to rotate, scale and translate bounding boxes and + points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'points', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans' and keys in + input_dict['bbox3d_fields'] are updated in the result dict. + """ + if 'transformation_3d_flow' not in input_dict: + input_dict['transformation_3d_flow'] = [] + + self._rot_bbox_points(input_dict) + + if 'pcd_scale_factor' not in input_dict: + self._random_scale(input_dict) + self._scale_bbox_points(input_dict) + + self._trans_bbox_points(input_dict) + + input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(rot_range={self.rot_range},' + repr_str += f' scale_ratio_range={self.scale_ratio_range},' + repr_str += f' translation_std={self.translation_std},' + repr_str += f' shift_height={self.shift_height})' + return repr_str + + +@PIPELINES.register_module() +class PointShuffle(object): + """Shuffle input points.""" + + def __call__(self, input_dict): + """Call function to shuffle points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. 
+ """ + idx = input_dict['points'].shuffle() + idx = idx.numpy() + + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[idx] + + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[idx] + + return input_dict + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class ObjectRangeFilter(object): + """Filter objects by the range. + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter objects by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' + keys are updated in the result dict. + """ + # Check points instance type and initialise bev_range + if isinstance(input_dict['gt_bboxes_3d'], + (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + bev_range = self.pcd_range[[0, 1, 3, 4]] + elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): + bev_range = self.pcd_range[[0, 2, 3, 5]] + + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + mask = gt_bboxes_3d.in_range_bev(bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class PointsRangeFilter(object): + """Filter points by the range. + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter points by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. 
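+
+        Example:
+            An illustrative pipeline entry; the range below is only a
+            placeholder, given in
+            ``[x_min, y_min, z_min, x_max, y_max, z_max]`` order:
+
+            >>> transform_cfg = dict(
+            ...     type='PointsRangeFilter',
+            ...     point_cloud_range=[0, -40, -3, 70.4, 40, 1])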
+ """ + points = input_dict['points'] + points_mask = points.in_range_3d(self.pcd_range) + clean_points = points[points_mask] + input_dict['points'] = clean_points + points_mask = points_mask.numpy() + + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[points_mask] + + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class ObjectNameFilter(object): + """Filter GT objects by their names. + + Args: + classes (list[str]): List of class names to be kept for training. + """ + + def __init__(self, classes): + self.classes = classes + self.labels = list(range(len(self.classes))) + + def __call__(self, input_dict): + """Call function to filter objects by their names. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' + keys are updated in the result dict. + """ + gt_labels_3d = input_dict['gt_labels_3d'] + gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], + dtype=np.bool_) + input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] + input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(classes={self.classes})' + return repr_str + + +@PIPELINES.register_module() +class PointSample(object): + """Point sample. + + Sampling data to a certain number. + + Args: + num_points (int): Number of points to be sampled. + sample_range (float, optional): The range where to sample points. + If not None, the points with depth larger than `sample_range` are + prior to be sampled. Defaults to None. + replace (bool, optional): Whether the sampling is with or without + replacement. Defaults to False. + """ + + def __init__(self, num_points, sample_range=None, replace=False): + self.num_points = num_points + self.sample_range = sample_range + self.replace = replace + + def _points_random_sampling(self, + points, + num_samples, + sample_range=None, + replace=False, + return_choices=False): + """Points random sampling. + + Sample points to a certain number. + + Args: + points (np.ndarray | :obj:`BasePoints`): 3D Points. + num_samples (int): Number of samples to be sampled. + sample_range (float, optional): Indicating the range where the + points will be sampled. Defaults to None. + replace (bool, optional): Sampling with or without replacement. + Defaults to None. + return_choices (bool, optional): Whether return choice. + Defaults to False. + Returns: + tuple[np.ndarray] | np.ndarray: + - points (np.ndarray | :obj:`BasePoints`): 3D Points. + - choices (np.ndarray, optional): The generated random samples. 
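+
+        Example:
+            The public ``PointSample`` transform delegates to this helper
+            from its ``__call__``; an illustrative pipeline entry with
+            placeholder numbers:
+
+            >>> transform_cfg = dict(
+            ...     type='PointSample',
+            ...     num_points=16384,
+            ...     sample_range=40.0)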
+ """ + if not replace: + replace = (points.shape[0] < num_samples) + point_range = range(len(points)) + if sample_range is not None and not replace: + # Only sampling the near points when len(points) >= num_samples + dist = np.linalg.norm(points.tensor, axis=1) + far_inds = np.where(dist >= sample_range)[0] + near_inds = np.where(dist < sample_range)[0] + # in case there are too many far points + if len(far_inds) > num_samples: + far_inds = np.random.choice( + far_inds, num_samples, replace=False) + point_range = near_inds + num_samples -= len(far_inds) + choices = np.random.choice(point_range, num_samples, replace=replace) + if sample_range is not None and not replace: + choices = np.concatenate((far_inds, choices)) + # Shuffle points after sampling + np.random.shuffle(choices) + if return_choices: + return points[choices], choices + else: + return points[choices] + + def __call__(self, results): + """Call function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = results['points'] + points, choices = self._points_random_sampling( + points, + self.num_points, + self.sample_range, + self.replace, + return_choices=True) + results['points'] = points + + pts_instance_mask = results.get('pts_instance_mask', None) + pts_semantic_mask = results.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + pts_instance_mask = pts_instance_mask[choices] + results['pts_instance_mask'] = pts_instance_mask + + if pts_semantic_mask is not None: + pts_semantic_mask = pts_semantic_mask[choices] + results['pts_semantic_mask'] = pts_semantic_mask + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_points={self.num_points},' + repr_str += f' sample_range={self.sample_range},' + repr_str += f' replace={self.replace})' + + return repr_str + + +@PIPELINES.register_module() +class IndoorPointSample(PointSample): + """Indoor point sample. + + Sampling data to a certain number. + NOTE: IndoorPointSample is deprecated in favor of PointSample + + Args: + num_points (int): Number of points to be sampled. + """ + + def __init__(self, *args, **kwargs): + warnings.warn( + 'IndoorPointSample is deprecated in favor of PointSample') + super(IndoorPointSample, self).__init__(*args, **kwargs) + + +@PIPELINES.register_module() +class IndoorPatchPointSample(object): + r"""Indoor point sample within a patch. Modified from `PointNet++ `_. + + Sampling data to a certain number for semantic segmentation. + + Args: + num_points (int): Number of points to be sampled. + block_size (float, optional): Size of a block to sample points from. + Defaults to 1.5. + sample_rate (float, optional): Stride used in sliding patch generation. + This parameter is unused in `IndoorPatchPointSample` and thus has + been deprecated. We plan to remove it in the future. + Defaults to None. + ignore_index (int, optional): Label index that won't be used for the + segmentation task. This is set in PointSegClassMapping as neg_cls. + If not None, will be used as a patch selection criterion. + Defaults to None. + use_normalized_coord (bool, optional): Whether to use normalized xyz as + additional features. Defaults to False. + num_try (int, optional): Number of times to try if the patch selected + is invalid. Defaults to 10. 
+ enlarge_size (float, optional): Enlarge the sampled patch to + [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as + an augmentation. If None, set it as 0. Defaults to 0.2. + min_unique_num (int, optional): Minimum number of unique points + the sampled patch should contain. If None, use PointNet++'s method + to judge uniqueness. Defaults to None. + eps (float, optional): A value added to patch boundary to guarantee + points coverage. Defaults to 1e-2. + + Note: + This transform should only be used in the training process of point + cloud segmentation tasks. For the sliding patch generation and + inference process in testing, please refer to the `slide_inference` + function of `EncoderDecoder3D` class. + """ + + def __init__(self, + num_points, + block_size=1.5, + sample_rate=None, + ignore_index=None, + use_normalized_coord=False, + num_try=10, + enlarge_size=0.2, + min_unique_num=None, + eps=1e-2): + self.num_points = num_points + self.block_size = block_size + self.ignore_index = ignore_index + self.use_normalized_coord = use_normalized_coord + self.num_try = num_try + self.enlarge_size = enlarge_size if enlarge_size is not None else 0.0 + self.min_unique_num = min_unique_num + self.eps = eps + + if sample_rate is not None: + warnings.warn( + "'sample_rate' has been deprecated and will be removed in " + 'the future. Please remove them from your code.') + + def _input_generation(self, coords, patch_center, coord_max, attributes, + attribute_dims, point_type): + """Generating model input. + + Generate input by subtracting patch center and adding additional + features. Currently support colors and normalized xyz as features. + + Args: + coords (np.ndarray): Sampled 3D Points. + patch_center (np.ndarray): Center coordinate of the selected patch. + coord_max (np.ndarray): Max coordinate of all 3D Points. + attributes (np.ndarray): features of input points. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. + point_type (type): class of input points inherited from BasePoints. + + Returns: + :obj:`BasePoints`: The generated input data. + """ + # subtract patch center, the z dimension is not centered + centered_coords = coords.copy() + centered_coords[:, 0] -= patch_center[0] + centered_coords[:, 1] -= patch_center[1] + + if self.use_normalized_coord: + normalized_coord = coords / coord_max + attributes = np.concatenate([attributes, normalized_coord], axis=1) + if attribute_dims is None: + attribute_dims = dict() + attribute_dims.update( + dict(normalized_coord=[ + attributes.shape[1], attributes.shape[1] + + 1, attributes.shape[1] + 2 + ])) + + points = np.concatenate([centered_coords, attributes], axis=1) + points = point_type( + points, points_dim=points.shape[1], attribute_dims=attribute_dims) + + return points + + def _patch_points_sampling(self, points, sem_mask): + """Patch points sampling. + + First sample a valid patch. + Then sample points within that patch to a certain number. + + Args: + points (:obj:`BasePoints`): 3D Points. + sem_mask (np.ndarray): semantic segmentation mask for input points. + + Returns: + tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`: + + - points (:obj:`BasePoints`): 3D Points. + - choices (np.ndarray): The generated random samples. 
+ """ + coords = points.coord.numpy() + attributes = points.tensor[:, 3:].numpy() + attribute_dims = points.attribute_dims + point_type = type(points) + + coord_max = np.amax(coords, axis=0) + coord_min = np.amin(coords, axis=0) + + for _ in range(self.num_try): + # random sample a point as patch center + cur_center = coords[np.random.choice(coords.shape[0])] + + # boundary of a patch, which would be enlarged by + # `self.enlarge_size` as an augmentation + cur_max = cur_center + np.array( + [self.block_size / 2.0, self.block_size / 2.0, 0.0]) + cur_min = cur_center - np.array( + [self.block_size / 2.0, self.block_size / 2.0, 0.0]) + cur_max[2] = coord_max[2] + cur_min[2] = coord_min[2] + cur_choice = np.sum( + (coords >= (cur_min - self.enlarge_size)) * + (coords <= (cur_max + self.enlarge_size)), + axis=1) == 3 + + if not cur_choice.any(): # no points in this patch + continue + + cur_coords = coords[cur_choice, :] + cur_sem_mask = sem_mask[cur_choice] + point_idxs = np.where(cur_choice)[0] + mask = np.sum( + (cur_coords >= (cur_min - self.eps)) * (cur_coords <= + (cur_max + self.eps)), + axis=1) == 3 + + # two criteria for patch sampling, adopted from PointNet++ + # 1. selected patch should contain enough unique points + if self.min_unique_num is None: + # use PointNet++'s method as default + # [31, 31, 62] are just some big values used to transform + # coords from 3d array to 1d and then check their uniqueness + # this is used in all the ScanNet code following PointNet++ + vidx = np.ceil( + (cur_coords[mask, :] - cur_min) / (cur_max - cur_min) * + np.array([31.0, 31.0, 62.0])) + vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 + + vidx[:, 2]) + flag1 = len(vidx) / 31.0 / 31.0 / 62.0 >= 0.02 + else: + # if `min_unique_num` is provided, directly compare with it + flag1 = mask.sum() >= self.min_unique_num + + # 2. selected patch should contain enough annotated points + if self.ignore_index is None: + flag2 = True + else: + flag2 = np.sum(cur_sem_mask != self.ignore_index) / \ + len(cur_sem_mask) >= 0.7 + + if flag1 and flag2: + break + + # sample idx to `self.num_points` + if point_idxs.size >= self.num_points: + # no duplicate in sub-sampling + choices = np.random.choice( + point_idxs, self.num_points, replace=False) + else: + # do not use random choice here to avoid some points not counted + dup = np.random.choice(point_idxs.size, + self.num_points - point_idxs.size) + idx_dup = np.concatenate( + [np.arange(point_idxs.size), + np.array(dup)], 0) + choices = point_idxs[idx_dup] + + # construct model input + points = self._input_generation(coords[choices], cur_center, coord_max, + attributes[choices], attribute_dims, + point_type) + + return points, choices + + def __call__(self, results): + """Call function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. 
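+
+        Example:
+            An illustrative pipeline entry for a segmentation dataset; the
+            numbers are placeholders and ``pts_semantic_mask`` must have
+            been loaded beforehand:
+
+            >>> transform_cfg = dict(
+            ...     type='IndoorPatchPointSample',
+            ...     num_points=4096,
+            ...     block_size=1.5,
+            ...     use_normalized_coord=True)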
+ """ + points = results['points'] + + assert 'pts_semantic_mask' in results.keys(), \ + 'semantic mask should be provided in training and evaluation' + pts_semantic_mask = results['pts_semantic_mask'] + + points, choices = self._patch_points_sampling(points, + pts_semantic_mask) + + results['points'] = points + results['pts_semantic_mask'] = pts_semantic_mask[choices] + pts_instance_mask = results.get('pts_instance_mask', None) + if pts_instance_mask is not None: + results['pts_instance_mask'] = pts_instance_mask[choices] + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_points={self.num_points},' + repr_str += f' block_size={self.block_size},' + repr_str += f' ignore_index={self.ignore_index},' + repr_str += f' use_normalized_coord={self.use_normalized_coord},' + repr_str += f' num_try={self.num_try},' + repr_str += f' enlarge_size={self.enlarge_size},' + repr_str += f' min_unique_num={self.min_unique_num},' + repr_str += f' eps={self.eps})' + return repr_str + + +@PIPELINES.register_module() +class BackgroundPointsFilter(object): + """Filter background points near the bounding box. + + Args: + bbox_enlarge_range (tuple[float], float): Bbox enlarge range. + """ + + def __init__(self, bbox_enlarge_range): + assert (is_tuple_of(bbox_enlarge_range, float) + and len(bbox_enlarge_range) == 3) \ + or isinstance(bbox_enlarge_range, float), \ + f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}' + + if isinstance(bbox_enlarge_range, float): + bbox_enlarge_range = [bbox_enlarge_range] * 3 + self.bbox_enlarge_range = np.array( + bbox_enlarge_range, dtype=np.float32)[np.newaxis, :] + + def __call__(self, input_dict): + """Call function to filter points by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = input_dict['points'] + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + + # avoid groundtruth being modified + gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy() + gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.clone().numpy() + + enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy() + enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range + points_numpy = points.tensor.clone().numpy() + foreground_masks = box_np_ops.points_in_rbbox( + points_numpy, gt_bboxes_3d_np, origin=(0.5, 0.5, 0.5)) + enlarge_foreground_masks = box_np_ops.points_in_rbbox( + points_numpy, enlarged_gt_bboxes_3d, origin=(0.5, 0.5, 0.5)) + foreground_masks = foreground_masks.max(1) + enlarge_foreground_masks = enlarge_foreground_masks.max(1) + valid_masks = ~np.logical_and(~foreground_masks, + enlarge_foreground_masks) + + input_dict['points'] = points[valid_masks] + pts_instance_mask = input_dict.get('pts_instance_mask', None) + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks] + + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks] + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class VoxelBasedPointSampler(object): + """Voxel based point sampler. 
+ + Apply voxel sampling to multiple sweep points. + + Args: + cur_sweep_cfg (dict): Config for sampling current points. + prev_sweep_cfg (dict): Config for sampling previous points. + time_dim (int): Index that indicate the time dimension + for input points. + """ + + def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3): + self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg) + self.cur_voxel_num = self.cur_voxel_generator._max_voxels + self.time_dim = time_dim + if prev_sweep_cfg is not None: + assert prev_sweep_cfg['max_num_points'] == \ + cur_sweep_cfg['max_num_points'] + self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg) + self.prev_voxel_num = self.prev_voxel_generator._max_voxels + else: + self.prev_voxel_generator = None + self.prev_voxel_num = 0 + + def _sample_points(self, points, sampler, point_dim): + """Sample points for each points subset. + + Args: + points (np.ndarray): Points subset to be sampled. + sampler (VoxelGenerator): Voxel based sampler for + each points subset. + point_dim (int): The dimension of each points + + Returns: + np.ndarray: Sampled points. + """ + voxels, coors, num_points_per_voxel = sampler.generate(points) + if voxels.shape[0] < sampler._max_voxels: + padding_points = np.zeros([ + sampler._max_voxels - voxels.shape[0], sampler._max_num_points, + point_dim + ], + dtype=points.dtype) + padding_points[:] = voxels[0] + sample_points = np.concatenate([voxels, padding_points], axis=0) + else: + sample_points = voxels + + return sample_points + + def __call__(self, results): + """Call function to sample points from multiple sweeps. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = results['points'] + original_dim = points.shape[1] + + # TODO: process instance and semantic mask while _max_num_points + # is larger than 1 + # Extend points with seg and mask fields + map_fields2dim = [] + start_dim = original_dim + points_numpy = points.tensor.numpy() + extra_channel = [points_numpy] + for idx, key in enumerate(results['pts_mask_fields']): + map_fields2dim.append((key, idx + start_dim)) + extra_channel.append(results[key][..., None]) + + start_dim += len(results['pts_mask_fields']) + for idx, key in enumerate(results['pts_seg_fields']): + map_fields2dim.append((key, idx + start_dim)) + extra_channel.append(results[key][..., None]) + + points_numpy = np.concatenate(extra_channel, axis=-1) + + # Split points into two part, current sweep points and + # previous sweeps points. + # TODO: support different sampling methods for next sweeps points + # and previous sweeps points. 
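+        # Points whose timestamp channel (`self.time_dim`) equals 0 belong
+        # to the current sweep; every other timestamp is treated as a
+        # previous sweep and, if configured, sampled by
+        # `prev_voxel_generator` below.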
+ cur_points_flag = (points_numpy[:, self.time_dim] == 0) + cur_sweep_points = points_numpy[cur_points_flag] + prev_sweeps_points = points_numpy[~cur_points_flag] + if prev_sweeps_points.shape[0] == 0: + prev_sweeps_points = cur_sweep_points + + # Shuffle points before sampling + np.random.shuffle(cur_sweep_points) + np.random.shuffle(prev_sweeps_points) + + cur_sweep_points = self._sample_points(cur_sweep_points, + self.cur_voxel_generator, + points_numpy.shape[1]) + if self.prev_voxel_generator is not None: + prev_sweeps_points = self._sample_points(prev_sweeps_points, + self.prev_voxel_generator, + points_numpy.shape[1]) + + points_numpy = np.concatenate( + [cur_sweep_points, prev_sweeps_points], 0) + else: + points_numpy = cur_sweep_points + + if self.cur_voxel_generator._max_num_points == 1: + points_numpy = points_numpy.squeeze(1) + results['points'] = points.new_point(points_numpy[..., :original_dim]) + + # Restore the corresponding seg and mask fields + for key, dim_index in map_fields2dim: + results[key] = points_numpy[..., dim_index] + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + + def _auto_indent(repr_str, indent): + repr_str = repr_str.split('\n') + repr_str = [' ' * indent + t + '\n' for t in repr_str] + repr_str = ''.join(repr_str)[:-1] + return repr_str + + repr_str = self.__class__.__name__ + indent = 4 + repr_str += '(\n' + repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n' + repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n' + repr_str += ' ' * indent + f'time_dim={self.time_dim},\n' + repr_str += ' ' * indent + 'cur_voxel_generator=\n' + repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n' + repr_str += ' ' * indent + 'prev_voxel_generator=\n' + repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})' + return repr_str + + +@PIPELINES.register_module() +class AffineResize(object): + """Get the affine transform matrices to the target size. + + Different from :class:`RandomAffine` in MMDetection, this class can + calculate the affine transform matrices while resizing the input image + to a fixed size. The affine transform matrices include: 1) matrix + transforming original image to the network input image size. 2) matrix + transforming original image to the network output feature map size. + + Args: + img_scale (tuple): Images scales for resizing. + down_ratio (int): The down ratio of feature map. + Actually the arg should be >= 1. + bbox_clip_border (bool, optional): Whether clip the objects + outside the border of the image. Defaults to True. + """ + + def __init__(self, img_scale, down_ratio, bbox_clip_border=True): + + self.img_scale = img_scale + self.down_ratio = down_ratio + self.bbox_clip_border = bbox_clip_border + + def __call__(self, results): + """Call function to do affine transform to input image and labels. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Results after affine resize, 'affine_aug', 'trans_mat' + keys are added in the result dict. 
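+
+        Example:
+            An illustrative pipeline entry, typically placed right after
+            ``RandomShiftScale``; the image scale and down ratio below are
+            placeholders:
+
+            >>> transform_cfg = dict(
+            ...     type='AffineResize',
+            ...     img_scale=(1280, 384),
+            ...     down_ratio=4)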
+ """ + # The results have gone through RandomShiftScale before AffineResize + if 'center' not in results: + img = results['img'] + height, width = img.shape[:2] + center = np.array([width / 2, height / 2], dtype=np.float32) + size = np.array([width, height], dtype=np.float32) + results['affine_aug'] = False + else: + # The results did not go through RandomShiftScale before + # AffineResize + img = results['img'] + center = results['center'] + size = results['size'] + + trans_affine = self._get_transform_matrix(center, size, self.img_scale) + + img = cv2.warpAffine(img, trans_affine[:2, :], self.img_scale) + + if isinstance(self.down_ratio, tuple): + trans_mat = [ + self._get_transform_matrix( + center, size, + (self.img_scale[0] // ratio, self.img_scale[1] // ratio)) + for ratio in self.down_ratio + ] # (3, 3) + else: + trans_mat = self._get_transform_matrix( + center, size, (self.img_scale[0] // self.down_ratio, + self.img_scale[1] // self.down_ratio)) + + results['img'] = img + results['img_shape'] = img.shape + results['pad_shape'] = img.shape + results['trans_mat'] = trans_mat + + self._affine_bboxes(results, trans_affine) + + if 'centers2d' in results: + centers2d = self._affine_transform(results['centers2d'], + trans_affine) + valid_index = (centers2d[:, 0] > + 0) & (centers2d[:, 0] < + self.img_scale[0]) & (centers2d[:, 1] > 0) & ( + centers2d[:, 1] < self.img_scale[1]) + results['centers2d'] = centers2d[valid_index] + + for key in results.get('bbox_fields', []): + if key in ['gt_bboxes']: + results[key] = results[key][valid_index] + if 'gt_labels' in results: + results['gt_labels'] = results['gt_labels'][ + valid_index] + if 'gt_masks' in results: + raise NotImplementedError( + 'AffineResize only supports bbox.') + + for key in results.get('bbox3d_fields', []): + if key in ['gt_bboxes_3d']: + results[key].tensor = results[key].tensor[valid_index] + if 'gt_labels_3d' in results: + results['gt_labels_3d'] = results['gt_labels_3d'][ + valid_index] + + results['depths'] = results['depths'][valid_index] + + return results + + def _affine_bboxes(self, results, matrix): + """Affine transform bboxes to input image. + + Args: + results (dict): Result dict from loading pipeline. + matrix (np.ndarray): Matrix transforming original + image to the network input image size. + shape: (3, 3) + """ + + for key in results.get('bbox_fields', []): + bboxes = results[key] + bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix) + bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix) + if self.bbox_clip_border: + bboxes[:, + [0, 2]] = bboxes[:, + [0, 2]].clip(0, self.img_scale[0] - 1) + bboxes[:, + [1, 3]] = bboxes[:, + [1, 3]].clip(0, self.img_scale[1] - 1) + results[key] = bboxes + + def _affine_transform(self, points, matrix): + """Affine transform bbox points to input image. + + Args: + points (np.ndarray): Points to be transformed. + shape: (N, 2) + matrix (np.ndarray): Affine transform matrix. + shape: (3, 3) + + Returns: + np.ndarray: Transformed points. + """ + num_points = points.shape[0] + hom_points_2d = np.concatenate((points, np.ones((num_points, 1))), + axis=1) + hom_points_2d = hom_points_2d.T + affined_points = np.matmul(matrix, hom_points_2d).T + return affined_points[:, :2] + + def _get_transform_matrix(self, center, scale, output_scale): + """Get affine transform matrix. + + Args: + center (tuple): Center of current image. + scale (tuple): Scale of current image. + output_scale (tuple[float]): The transform target image scales. 
+ + Returns: + np.ndarray: Affine transform matrix. + """ + # TODO: further add rot and shift here. + src_w = scale[0] + dst_w = output_scale[0] + dst_h = output_scale[1] + + src_dir = np.array([0, src_w * -0.5]) + dst_dir = np.array([0, dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + src[1, :] = center + src_dir + dst[0, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2, :] = self._get_ref_point(src[0, :], src[1, :]) + dst[2, :] = self._get_ref_point(dst[0, :], dst[1, :]) + + get_matrix = cv2.getAffineTransform(src, dst) + + matrix = np.concatenate((get_matrix, [[0., 0., 1.]])) + + return matrix.astype(np.float32) + + def _get_ref_point(self, ref_point1, ref_point2): + """Get reference point to calculate affine transform matrix. + + While using opencv to calculate the affine matrix, we need at least + three corresponding points separately on original image and target + image. Here we use two points to get the the third reference point. + """ + d = ref_point1 - ref_point2 + ref_point3 = ref_point2 + np.array([-d[1], d[0]]) + return ref_point3 + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'down_ratio={self.down_ratio}) ' + return repr_str + + +@PIPELINES.register_module() +class RandomShiftScale(object): + """Random shift scale. + + Different from the normal shift and scale function, it doesn't + directly shift or scale image. It can record the shift and scale + infos into loading pipelines. It's designed to be used with + AffineResize together. + + Args: + shift_scale (tuple[float]): Shift and scale range. + aug_prob (float): The shifting and scaling probability. + """ + + def __init__(self, shift_scale, aug_prob): + + self.shift_scale = shift_scale + self.aug_prob = aug_prob + + def __call__(self, results): + """Call function to record random shift and scale infos. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Results after random shift and scale, 'center', 'size' + and 'affine_aug' keys are added in the result dict. + """ + img = results['img'] + + height, width = img.shape[:2] + + center = np.array([width / 2, height / 2], dtype=np.float32) + size = np.array([width, height], dtype=np.float32) + + if random.random() < self.aug_prob: + shift, scale = self.shift_scale[0], self.shift_scale[1] + shift_ranges = np.arange(-shift, shift + 0.1, 0.1) + center[0] += size[0] * random.choice(shift_ranges) + center[1] += size[1] * random.choice(shift_ranges) + scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1) + size *= random.choice(scale_ranges) + results['affine_aug'] = True + else: + results['affine_aug'] = False + + results['center'] = center + results['size'] = size + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(shift_scale={self.shift_scale}, ' + repr_str += f'aug_prob={self.aug_prob}) ' + return repr_str diff --git a/mmdet3d/datasets/pipelines/__init__.py b/mmdet3d/datasets/pipelines/__init__.py index 317f605..76c43a6 100644 --- a/mmdet3d/datasets/pipelines/__init__.py +++ b/mmdet3d/datasets/pipelines/__init__.py @@ -1,34 +1,34 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .compose import Compose -from .dbsampler import DataBaseSampler -from .formating import Collect3D, DefaultFormatBundle, DefaultFormatBundle3D -from .loading import (LoadAnnotations3D, LoadImageFromFileMono3D, - LoadMultiViewImageFromFiles, LoadPointsFromDict, - LoadPointsFromFile, LoadPointsFromMultiSweeps, - NormalizePointsColor, PointSegClassMapping) -from .test_time_aug import MultiScaleFlipAug3D -# yapf: disable -from .transforms_3d import (AffineResize, BackgroundPointsFilter, - GlobalAlignment, GlobalRotScaleTrans, - IndoorPatchPointSample, IndoorPointSample, - MultiViewWrapper, ObjectNameFilter, ObjectNoise, - ObjectRangeFilter, ObjectSample, PointSample, - PointShuffle, PointsRangeFilter, - RandomDropPointsColor, RandomFlip3D, - RandomJitterPoints, RandomRotate, RandomShiftScale, - RangeLimitedRandomCrop, VoxelBasedPointSampler) - -__all__ = [ - 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', - 'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D', - 'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile', - 'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler', - 'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample', - 'PointSample', 'PointSegClassMapping', 'MultiScaleFlipAug3D', - 'LoadPointsFromMultiSweeps', 'BackgroundPointsFilter', - 'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample', - 'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor', - 'RandomJitterPoints', 'AffineResize', 'RandomShiftScale', - 'LoadPointsFromDict', 'MultiViewWrapper', 'RandomRotate', - 'RangeLimitedRandomCrop' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .compose import Compose +from .dbsampler import DataBaseSampler +from .formating import Collect3D, DefaultFormatBundle, DefaultFormatBundle3D +from .loading import (LoadAnnotations3D, LoadImageFromFileMono3D, + LoadMultiViewImageFromFiles, LoadPointsFromDict, + LoadPointsFromFile, LoadPointsFromMultiSweeps, + NormalizePointsColor, PointSegClassMapping) +from .test_time_aug import MultiScaleFlipAug3D +# yapf: disable +from .transforms_3d import (AffineResize, BackgroundPointsFilter, + GlobalAlignment, GlobalRotScaleTrans, + IndoorPatchPointSample, IndoorPointSample, + MultiViewWrapper, ObjectNameFilter, ObjectNoise, + ObjectRangeFilter, ObjectSample, PointSample, + PointShuffle, PointsRangeFilter, + RandomDropPointsColor, RandomFlip3D, + RandomJitterPoints, RandomRotate, RandomShiftScale, + RangeLimitedRandomCrop, VoxelBasedPointSampler) + +__all__ = [ + 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', + 'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D', + 'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile', + 'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler', + 'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample', + 'PointSample', 'PointSegClassMapping', 'MultiScaleFlipAug3D', + 'LoadPointsFromMultiSweeps', 'BackgroundPointsFilter', + 'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample', + 'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor', + 'RandomJitterPoints', 'AffineResize', 'RandomShiftScale', + 'LoadPointsFromDict', 'MultiViewWrapper', 'RandomRotate', + 'RangeLimitedRandomCrop' +] diff --git a/mmdet3d/datasets/pipelines/compose.py b/mmdet3d/datasets/pipelines/compose.py index 9ab25d9..8c061bb 100644 --- a/mmdet3d/datasets/pipelines/compose.py +++ b/mmdet3d/datasets/pipelines/compose.py @@ -1,60 +1,60 @@ -# 
Copyright (c) OpenMMLab. All rights reserved. -import collections - -from mmcv.utils import build_from_cfg - -from mmdet.datasets.builder import PIPELINES as MMDET_PIPELINES -from ..builder import PIPELINES - - -@PIPELINES.register_module() -class Compose: - """Compose multiple transforms sequentially. The pipeline registry of - mmdet3d separates with mmdet, however, sometimes we may need to use mmdet's - pipeline. So the class is rewritten to be able to use pipelines from both - mmdet3d and mmdet. - - Args: - transforms (Sequence[dict | callable]): Sequence of transform object or - config dict to be composed. - """ - - def __init__(self, transforms): - assert isinstance(transforms, collections.abc.Sequence) - self.transforms = [] - for transform in transforms: - if isinstance(transform, dict): - _, key = PIPELINES.split_scope_key(transform['type']) - if key in PIPELINES._module_dict.keys(): - transform = build_from_cfg(transform, PIPELINES) - else: - transform = build_from_cfg(transform, MMDET_PIPELINES) - self.transforms.append(transform) - elif callable(transform): - self.transforms.append(transform) - else: - raise TypeError('transform must be callable or a dict') - - def __call__(self, data): - """Call function to apply transforms sequentially. - - Args: - data (dict): A result dict contains the data to transform. - - Returns: - dict: Transformed data. - """ - - for t in self.transforms: - data = t(data) - if data is None: - return None - return data - - def __repr__(self): - format_string = self.__class__.__name__ + '(' - for t in self.transforms: - format_string += '\n' - format_string += f' {t}' - format_string += '\n)' - return format_string +# Copyright (c) OpenMMLab. All rights reserved. +import collections + +from mmcv.utils import build_from_cfg + +from mmdet.datasets.builder import PIPELINES as MMDET_PIPELINES +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class Compose: + """Compose multiple transforms sequentially. The pipeline registry of + mmdet3d separates with mmdet, however, sometimes we may need to use mmdet's + pipeline. So the class is rewritten to be able to use pipelines from both + mmdet3d and mmdet. + + Args: + transforms (Sequence[dict | callable]): Sequence of transform object or + config dict to be composed. + """ + + def __init__(self, transforms): + assert isinstance(transforms, collections.abc.Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + _, key = PIPELINES.split_scope_key(transform['type']) + if key in PIPELINES._module_dict.keys(): + transform = build_from_cfg(transform, PIPELINES) + else: + transform = build_from_cfg(transform, MMDET_PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict') + + def __call__(self, data): + """Call function to apply transforms sequentially. + + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. 
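+
+        Example:
+            Because each ``type`` is looked up in the mmdet3d registry
+            first and then in mmdet's, a pipeline may mix transforms from
+            both packages; the entries below are illustrative placeholders:
+
+            >>> pipeline = Compose([
+            ...     dict(type='PointShuffle'),  # registered in mmdet3d
+            ...     dict(type='Resize', img_scale=(1333, 800),
+            ...          keep_ratio=True),  # falls back to mmdet
+            ... ])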
+ """ + + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += f' {t}' + format_string += '\n)' + return format_string diff --git a/mmdet3d/datasets/pipelines/data_augment_utils.py b/mmdet3d/datasets/pipelines/data_augment_utils.py index 21be3c0..785df15 100644 --- a/mmdet3d/datasets/pipelines/data_augment_utils.py +++ b/mmdet3d/datasets/pipelines/data_augment_utils.py @@ -1,411 +1,411 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -import numba -import numpy as np -from numba.core.errors import NumbaPerformanceWarning - -from mmdet3d.core.bbox import box_np_ops - -warnings.filterwarnings('ignore', category=NumbaPerformanceWarning) - - -@numba.njit -def _rotation_box2d_jit_(corners, angle, rot_mat_T): - """Rotate 2D boxes. - - Args: - corners (np.ndarray): Corners of boxes. - angle (float): Rotation angle. - rot_mat_T (np.ndarray): Transposed rotation matrix. - """ - rot_sin = np.sin(angle) - rot_cos = np.cos(angle) - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = rot_sin - rot_mat_T[1, 0] = -rot_sin - rot_mat_T[1, 1] = rot_cos - corners[:] = corners @ rot_mat_T - - -@numba.jit(nopython=True) -def box_collision_test(boxes, qboxes, clockwise=True): - """Box collision test. - - Args: - boxes (np.ndarray): Corners of current boxes. - qboxes (np.ndarray): Boxes to be avoid colliding. - clockwise (bool, optional): Whether the corners are in - clockwise order. Default: True. - """ - N = boxes.shape[0] - K = qboxes.shape[0] - ret = np.zeros((N, K), dtype=np.bool_) - slices = np.array([1, 2, 3, 0]) - lines_boxes = np.stack((boxes, boxes[:, slices, :]), - axis=2) # [N, 4, 2(line), 2(xy)] - lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2) - # vec = np.zeros((2,), dtype=boxes.dtype) - boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes) - qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes) - for i in range(N): - for j in range(K): - # calculate standup first - iw = ( - min(boxes_standup[i, 2], qboxes_standup[j, 2]) - - max(boxes_standup[i, 0], qboxes_standup[j, 0])) - if iw > 0: - ih = ( - min(boxes_standup[i, 3], qboxes_standup[j, 3]) - - max(boxes_standup[i, 1], qboxes_standup[j, 1])) - if ih > 0: - for k in range(4): - for box_l in range(4): - A = lines_boxes[i, k, 0] - B = lines_boxes[i, k, 1] - C = lines_qboxes[j, box_l, 0] - D = lines_qboxes[j, box_l, 1] - acd = (D[1] - A[1]) * (C[0] - - A[0]) > (C[1] - A[1]) * ( - D[0] - A[0]) - bcd = (D[1] - B[1]) * (C[0] - - B[0]) > (C[1] - B[1]) * ( - D[0] - B[0]) - if acd != bcd: - abc = (C[1] - A[1]) * (B[0] - A[0]) > ( - B[1] - A[1]) * ( - C[0] - A[0]) - abd = (D[1] - A[1]) * (B[0] - A[0]) > ( - B[1] - A[1]) * ( - D[0] - A[0]) - if abc != abd: - ret[i, j] = True # collision. - break - if ret[i, j] is True: - break - if ret[i, j] is False: - # now check complete overlap. 
- # box overlap qbox: - box_overlap_qbox = True - for box_l in range(4): # point l in qboxes - for k in range(4): # corner k in boxes - vec = boxes[i, k] - boxes[i, (k + 1) % 4] - if clockwise: - vec = -vec - cross = vec[1] * ( - boxes[i, k, 0] - qboxes[j, box_l, 0]) - cross -= vec[0] * ( - boxes[i, k, 1] - qboxes[j, box_l, 1]) - if cross >= 0: - box_overlap_qbox = False - break - if box_overlap_qbox is False: - break - - if box_overlap_qbox is False: - qbox_overlap_box = True - for box_l in range(4): # point box_l in boxes - for k in range(4): # corner k in qboxes - vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] - if clockwise: - vec = -vec - cross = vec[1] * ( - qboxes[j, k, 0] - boxes[i, box_l, 0]) - cross -= vec[0] * ( - qboxes[j, k, 1] - boxes[i, box_l, 1]) - if cross >= 0: # - qbox_overlap_box = False - break - if qbox_overlap_box is False: - break - if qbox_overlap_box: - ret[i, j] = True # collision. - else: - ret[i, j] = True # collision. - return ret - - -@numba.njit -def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): - """Add noise to every box (only on the horizontal plane). - - Args: - boxes (np.ndarray): Input boxes with shape (N, 5). - valid_mask (np.ndarray): Mask to indicate which boxes are valid - with shape (N). - loc_noises (np.ndarray): Location noises with shape (N, M, 3). - rot_noises (np.ndarray): Rotation noises with shape (N, M). - - Returns: - np.ndarray: Mask to indicate whether the noise is - added successfully (pass the collision test). - """ - num_boxes = boxes.shape[0] - num_tests = loc_noises.shape[1] - box_corners = box_np_ops.box2d_to_corner_jit(boxes) - current_corners = np.zeros((4, 2), dtype=boxes.dtype) - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - success_mask = -np.ones((num_boxes, ), dtype=np.int64) - # print(valid_mask) - for i in range(num_boxes): - if valid_mask[i]: - for j in range(num_tests): - current_corners[:] = box_corners[i] - current_corners -= boxes[i, :2] - _rotation_box2d_jit_(current_corners, rot_noises[i, j], - rot_mat_T) - current_corners += boxes[i, :2] + loc_noises[i, j, :2] - coll_mat = box_collision_test( - current_corners.reshape(1, 4, 2), box_corners) - coll_mat[0, i] = False - # print(coll_mat) - if not coll_mat.any(): - success_mask[i] = j - box_corners[i] = current_corners - break - return success_mask - - -@numba.njit -def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, - global_rot_noises): - """Add noise to every box (only on the horizontal plane). Version 2 used - when enable global rotations. - - Args: - boxes (np.ndarray): Input boxes with shape (N, 5). - valid_mask (np.ndarray): Mask to indicate which boxes are valid - with shape (N). - loc_noises (np.ndarray): Location noises with shape (N, M, 3). - rot_noises (np.ndarray): Rotation noises with shape (N, M). - - Returns: - np.ndarray: Mask to indicate whether the noise is - added successfully (pass the collision test). 
- """ - num_boxes = boxes.shape[0] - num_tests = loc_noises.shape[1] - box_corners = box_np_ops.box2d_to_corner_jit(boxes) - current_corners = np.zeros((4, 2), dtype=boxes.dtype) - current_box = np.zeros((1, 5), dtype=boxes.dtype) - rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) - dst_pos = np.zeros((2, ), dtype=boxes.dtype) - success_mask = -np.ones((num_boxes, ), dtype=np.int64) - corners_norm = np.zeros((4, 2), dtype=boxes.dtype) - corners_norm[1, 1] = 1.0 - corners_norm[2] = 1.0 - corners_norm[3, 0] = 1.0 - corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) - corners_norm = corners_norm.reshape(4, 2) - for i in range(num_boxes): - if valid_mask[i]: - for j in range(num_tests): - current_box[0, :] = boxes[i] - current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2) - current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) - dst_grot = current_grot + global_rot_noises[i, j] - dst_pos[0] = current_radius * np.sin(dst_grot) - dst_pos[1] = current_radius * np.cos(dst_grot) - current_box[0, :2] = dst_pos - current_box[0, -1] += (dst_grot - current_grot) - - rot_sin = np.sin(current_box[0, -1]) - rot_cos = np.cos(current_box[0, -1]) - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = rot_sin - rot_mat_T[1, 0] = -rot_sin - rot_mat_T[1, 1] = rot_cos - current_corners[:] = current_box[ - 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] - current_corners -= current_box[0, :2] - _rotation_box2d_jit_(current_corners, rot_noises[i, j], - rot_mat_T) - current_corners += current_box[0, :2] + loc_noises[i, j, :2] - coll_mat = box_collision_test( - current_corners.reshape(1, 4, 2), box_corners) - coll_mat[0, i] = False - if not coll_mat.any(): - success_mask[i] = j - box_corners[i] = current_corners - loc_noises[i, j, :2] += (dst_pos - boxes[i, :2]) - rot_noises[i, j] += (dst_grot - current_grot) - break - return success_mask - - -def _select_transform(transform, indices): - """Select transform. - - Args: - transform (np.ndarray): Transforms to select from. - indices (np.ndarray): Mask to indicate which transform to select. - - Returns: - np.ndarray: Selected transforms. - """ - result = np.zeros((transform.shape[0], *transform.shape[2:]), - dtype=transform.dtype) - for i in range(transform.shape[0]): - if indices[i] != -1: - result[i] = transform[i, indices[i]] - return result - - -@numba.njit -def _rotation_matrix_3d_(rot_mat_T, angle, axis): - """Get the 3D rotation matrix. - - Args: - rot_mat_T (np.ndarray): Transposed rotation matrix. - angle (float): Rotation angle. - axis (int): Rotation axis. - """ - rot_sin = np.sin(angle) - rot_cos = np.cos(angle) - rot_mat_T[:] = np.eye(3) - if axis == 1: - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 2] = rot_sin - rot_mat_T[2, 0] = -rot_sin - rot_mat_T[2, 2] = rot_cos - elif axis == 2 or axis == -1: - rot_mat_T[0, 0] = rot_cos - rot_mat_T[0, 1] = rot_sin - rot_mat_T[1, 0] = -rot_sin - rot_mat_T[1, 1] = rot_cos - elif axis == 0: - rot_mat_T[1, 1] = rot_cos - rot_mat_T[1, 2] = rot_sin - rot_mat_T[2, 1] = -rot_sin - rot_mat_T[2, 2] = rot_cos - - -@numba.njit -def points_transform_(points, centers, point_masks, loc_transform, - rot_transform, valid_mask): - """Apply transforms to points and box centers. - - Args: - points (np.ndarray): Input points. - centers (np.ndarray): Input box centers. - point_masks (np.ndarray): Mask to indicate which points need - to be transformed. - loc_transform (np.ndarray): Location transform to be applied. - rot_transform (np.ndarray): Rotation transform to be applied. 
- valid_mask (np.ndarray): Mask to indicate which boxes are valid. - """ - num_box = centers.shape[0] - num_points = points.shape[0] - rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) - for i in range(num_box): - _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) - for i in range(num_points): - for j in range(num_box): - if valid_mask[j]: - if point_masks[i, j] == 1: - points[i, :3] -= centers[j, :3] - points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j] - points[i, :3] += centers[j, :3] - points[i, :3] += loc_transform[j] - break # only apply first box's transform - - -@numba.njit -def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): - """Transform 3D boxes. - - Args: - boxes (np.ndarray): 3D boxes to be transformed. - loc_transform (np.ndarray): Location transform to be applied. - rot_transform (np.ndarray): Rotation transform to be applied. - valid_mask (np.ndarray): Mask to indicate which boxes are valid. - """ - num_box = boxes.shape[0] - for i in range(num_box): - if valid_mask[i]: - boxes[i, :3] += loc_transform[i] - boxes[i, 6] += rot_transform[i] - - -def noise_per_object_v3_(gt_boxes, - points=None, - valid_mask=None, - rotation_perturb=np.pi / 4, - center_noise_std=1.0, - global_random_rot_range=np.pi / 4, - num_try=100): - """Random rotate or remove each groundtruth independently. use kitti viewer - to test this function points_transform_ - - Args: - gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7). - points (np.ndarray, optional): Input point cloud with - shape (M, 4). Default: None. - valid_mask (np.ndarray, optional): Mask to indicate which - boxes are valid. Default: None. - rotation_perturb (float, optional): Rotation perturbation. - Default: pi / 4. - center_noise_std (float, optional): Center noise standard deviation. - Default: 1.0. - global_random_rot_range (float, optional): Global random rotation - range. Default: pi/4. - num_try (int, optional): Number of try. Default: 100. - """ - num_boxes = gt_boxes.shape[0] - if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): - rotation_perturb = [-rotation_perturb, rotation_perturb] - if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): - global_random_rot_range = [ - -global_random_rot_range, global_random_rot_range - ] - enable_grot = np.abs(global_random_rot_range[0] - - global_random_rot_range[1]) >= 1e-3 - - if not isinstance(center_noise_std, (list, tuple, np.ndarray)): - center_noise_std = [ - center_noise_std, center_noise_std, center_noise_std - ] - if valid_mask is None: - valid_mask = np.ones((num_boxes, ), dtype=np.bool_) - center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) - - loc_noises = np.random.normal( - scale=center_noise_std, size=[num_boxes, num_try, 3]) - rot_noises = np.random.uniform( - rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try]) - gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) - grot_lowers = global_random_rot_range[0] - gt_grots - grot_uppers = global_random_rot_range[1] - gt_grots - global_rot_noises = np.random.uniform( - grot_lowers[..., np.newaxis], - grot_uppers[..., np.newaxis], - size=[num_boxes, num_try]) - - origin = (0.5, 0.5, 0) - gt_box_corners = box_np_ops.center_to_corner_box3d( - gt_boxes[:, :3], - gt_boxes[:, 3:6], - gt_boxes[:, 6], - origin=origin, - axis=2) - - # TODO: rewrite this noise box function? 
- if not enable_grot: - selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]], - valid_mask, loc_noises, rot_noises) - else: - selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]], - valid_mask, loc_noises, rot_noises, - global_rot_noises) - - loc_transforms = _select_transform(loc_noises, selected_noise) - rot_transforms = _select_transform(rot_noises, selected_noise) - surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) - if points is not None: - # TODO: replace this points_in_convex function by my tools? - point_masks = box_np_ops.points_in_convex_polygon_3d_jit( - points[:, :3], surfaces) - points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms, - rot_transforms, valid_mask) - - box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numba +import numpy as np +from numba.core.errors import NumbaPerformanceWarning + +from mmdet3d.core.bbox import box_np_ops + +warnings.filterwarnings('ignore', category=NumbaPerformanceWarning) + + +@numba.njit +def _rotation_box2d_jit_(corners, angle, rot_mat_T): + """Rotate 2D boxes. + + Args: + corners (np.ndarray): Corners of boxes. + angle (float): Rotation angle. + rot_mat_T (np.ndarray): Transposed rotation matrix. + """ + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin + rot_mat_T[1, 1] = rot_cos + corners[:] = corners @ rot_mat_T + + +@numba.jit(nopython=True) +def box_collision_test(boxes, qboxes, clockwise=True): + """Box collision test. + + Args: + boxes (np.ndarray): Corners of current boxes. + qboxes (np.ndarray): Boxes to be avoid colliding. + clockwise (bool, optional): Whether the corners are in + clockwise order. Default: True. + """ + N = boxes.shape[0] + K = qboxes.shape[0] + ret = np.zeros((N, K), dtype=np.bool_) + slices = np.array([1, 2, 3, 0]) + lines_boxes = np.stack((boxes, boxes[:, slices, :]), + axis=2) # [N, 4, 2(line), 2(xy)] + lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2) + # vec = np.zeros((2,), dtype=boxes.dtype) + boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes) + qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes) + for i in range(N): + for j in range(K): + # calculate standup first + iw = ( + min(boxes_standup[i, 2], qboxes_standup[j, 2]) - + max(boxes_standup[i, 0], qboxes_standup[j, 0])) + if iw > 0: + ih = ( + min(boxes_standup[i, 3], qboxes_standup[j, 3]) - + max(boxes_standup[i, 1], qboxes_standup[j, 1])) + if ih > 0: + for k in range(4): + for box_l in range(4): + A = lines_boxes[i, k, 0] + B = lines_boxes[i, k, 1] + C = lines_qboxes[j, box_l, 0] + D = lines_qboxes[j, box_l, 1] + acd = (D[1] - A[1]) * (C[0] - + A[0]) > (C[1] - A[1]) * ( + D[0] - A[0]) + bcd = (D[1] - B[1]) * (C[0] - + B[0]) > (C[1] - B[1]) * ( + D[0] - B[0]) + if acd != bcd: + abc = (C[1] - A[1]) * (B[0] - A[0]) > ( + B[1] - A[1]) * ( + C[0] - A[0]) + abd = (D[1] - A[1]) * (B[0] - A[0]) > ( + B[1] - A[1]) * ( + D[0] - A[0]) + if abc != abd: + ret[i, j] = True # collision. + break + if ret[i, j] is True: + break + if ret[i, j] is False: + # now check complete overlap. 
+ # box overlap qbox: + box_overlap_qbox = True + for box_l in range(4): # point l in qboxes + for k in range(4): # corner k in boxes + vec = boxes[i, k] - boxes[i, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * ( + boxes[i, k, 0] - qboxes[j, box_l, 0]) + cross -= vec[0] * ( + boxes[i, k, 1] - qboxes[j, box_l, 1]) + if cross >= 0: + box_overlap_qbox = False + break + if box_overlap_qbox is False: + break + + if box_overlap_qbox is False: + qbox_overlap_box = True + for box_l in range(4): # point box_l in boxes + for k in range(4): # corner k in qboxes + vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * ( + qboxes[j, k, 0] - boxes[i, box_l, 0]) + cross -= vec[0] * ( + qboxes[j, k, 1] - boxes[i, box_l, 1]) + if cross >= 0: # + qbox_overlap_box = False + break + if qbox_overlap_box is False: + break + if qbox_overlap_box: + ret[i, j] = True # collision. + else: + ret[i, j] = True # collision. + return ret + + +@numba.njit +def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): + """Add noise to every box (only on the horizontal plane). + + Args: + boxes (np.ndarray): Input boxes with shape (N, 5). + valid_mask (np.ndarray): Mask to indicate which boxes are valid + with shape (N). + loc_noises (np.ndarray): Location noises with shape (N, M, 3). + rot_noises (np.ndarray): Rotation noises with shape (N, M). + + Returns: + np.ndarray: Mask to indicate whether the noise is + added successfully (pass the collision test). + """ + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes, ), dtype=np.int64) + # print(valid_mask) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_corners[:] = box_corners[i] + current_corners -= boxes[i, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], + rot_mat_T) + current_corners += boxes[i, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners) + coll_mat[0, i] = False + # print(coll_mat) + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + break + return success_mask + + +@numba.njit +def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, + global_rot_noises): + """Add noise to every box (only on the horizontal plane). Version 2 used + when enable global rotations. + + Args: + boxes (np.ndarray): Input boxes with shape (N, 5). + valid_mask (np.ndarray): Mask to indicate which boxes are valid + with shape (N). + loc_noises (np.ndarray): Location noises with shape (N, M, 3). + rot_noises (np.ndarray): Rotation noises with shape (N, M). + + Returns: + np.ndarray: Mask to indicate whether the noise is + added successfully (pass the collision test). 
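# A minimal, self-contained sketch (not taken from this patch) of how the
# box_collision_test and noise_per_box helpers above are exercised: BEV
# corner arrays of shape (N, 4, 2) go in, an (N, K) boolean collision matrix
# comes out. The box values below are made up; the corners are assumed to
# come from box_np_ops.center_to_corner_box2d, the same helper that
# sample_class_v2 uses later in this patch.
import numpy as np

from mmdet3d.core.bbox import box_np_ops
from mmdet3d.datasets.pipelines import data_augment_utils

centers = np.array([[0.0, 0.0], [0.5, 0.0]], dtype=np.float32)  # (x, y)
dims = np.array([[2.0, 1.0], [2.0, 1.0]], dtype=np.float32)     # (dx, dy)
yaws = np.array([0.0, 0.3], dtype=np.float32)

corners = box_np_ops.center_to_corner_box2d(centers, dims, yaws)  # (2, 4, 2)
coll = data_augment_utils.box_collision_test(corners, corners)    # (2, 2)
coll[np.arange(2), np.arange(2)] = False  # ignore self-collision on diagonal
print(coll)  # True off the diagonal: these two boxes overlap in BEV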
+ """ + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + current_box = np.zeros((1, 5), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + dst_pos = np.zeros((2, ), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes, ), dtype=np.int64) + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners_norm = corners_norm.reshape(4, 2) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_box[0, :] = boxes[i] + current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2) + current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) + dst_grot = current_grot + global_rot_noises[i, j] + dst_pos[0] = current_radius * np.sin(dst_grot) + dst_pos[1] = current_radius * np.cos(dst_grot) + current_box[0, :2] = dst_pos + current_box[0, -1] += (dst_grot - current_grot) + + rot_sin = np.sin(current_box[0, -1]) + rot_cos = np.cos(current_box[0, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin + rot_mat_T[1, 1] = rot_cos + current_corners[:] = current_box[ + 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] + current_corners -= current_box[0, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], + rot_mat_T) + current_corners += current_box[0, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners) + coll_mat[0, i] = False + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + loc_noises[i, j, :2] += (dst_pos - boxes[i, :2]) + rot_noises[i, j] += (dst_grot - current_grot) + break + return success_mask + + +def _select_transform(transform, indices): + """Select transform. + + Args: + transform (np.ndarray): Transforms to select from. + indices (np.ndarray): Mask to indicate which transform to select. + + Returns: + np.ndarray: Selected transforms. + """ + result = np.zeros((transform.shape[0], *transform.shape[2:]), + dtype=transform.dtype) + for i in range(transform.shape[0]): + if indices[i] != -1: + result[i] = transform[i, indices[i]] + return result + + +@numba.njit +def _rotation_matrix_3d_(rot_mat_T, angle, axis): + """Get the 3D rotation matrix. + + Args: + rot_mat_T (np.ndarray): Transposed rotation matrix. + angle (float): Rotation angle. + axis (int): Rotation axis. + """ + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[:] = np.eye(3) + if axis == 1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 2] = rot_sin + rot_mat_T[2, 0] = -rot_sin + rot_mat_T[2, 2] = rot_cos + elif axis == 2 or axis == -1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = rot_sin + rot_mat_T[1, 0] = -rot_sin + rot_mat_T[1, 1] = rot_cos + elif axis == 0: + rot_mat_T[1, 1] = rot_cos + rot_mat_T[1, 2] = rot_sin + rot_mat_T[2, 1] = -rot_sin + rot_mat_T[2, 2] = rot_cos + + +@numba.njit +def points_transform_(points, centers, point_masks, loc_transform, + rot_transform, valid_mask): + """Apply transforms to points and box centers. + + Args: + points (np.ndarray): Input points. + centers (np.ndarray): Input box centers. + point_masks (np.ndarray): Mask to indicate which points need + to be transformed. + loc_transform (np.ndarray): Location transform to be applied. + rot_transform (np.ndarray): Rotation transform to be applied. 
+ valid_mask (np.ndarray): Mask to indicate which boxes are valid. + """ + num_box = centers.shape[0] + num_points = points.shape[0] + rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) + for i in range(num_box): + _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) + for i in range(num_points): + for j in range(num_box): + if valid_mask[j]: + if point_masks[i, j] == 1: + points[i, :3] -= centers[j, :3] + points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j] + points[i, :3] += centers[j, :3] + points[i, :3] += loc_transform[j] + break # only apply first box's transform + + +@numba.njit +def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): + """Transform 3D boxes. + + Args: + boxes (np.ndarray): 3D boxes to be transformed. + loc_transform (np.ndarray): Location transform to be applied. + rot_transform (np.ndarray): Rotation transform to be applied. + valid_mask (np.ndarray): Mask to indicate which boxes are valid. + """ + num_box = boxes.shape[0] + for i in range(num_box): + if valid_mask[i]: + boxes[i, :3] += loc_transform[i] + boxes[i, 6] += rot_transform[i] + + +def noise_per_object_v3_(gt_boxes, + points=None, + valid_mask=None, + rotation_perturb=np.pi / 4, + center_noise_std=1.0, + global_random_rot_range=np.pi / 4, + num_try=100): + """Random rotate or remove each groundtruth independently. use kitti viewer + to test this function points_transform_ + + Args: + gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7). + points (np.ndarray, optional): Input point cloud with + shape (M, 4). Default: None. + valid_mask (np.ndarray, optional): Mask to indicate which + boxes are valid. Default: None. + rotation_perturb (float, optional): Rotation perturbation. + Default: pi / 4. + center_noise_std (float, optional): Center noise standard deviation. + Default: 1.0. + global_random_rot_range (float, optional): Global random rotation + range. Default: pi/4. + num_try (int, optional): Number of try. Default: 100. + """ + num_boxes = gt_boxes.shape[0] + if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): + rotation_perturb = [-rotation_perturb, rotation_perturb] + if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): + global_random_rot_range = [ + -global_random_rot_range, global_random_rot_range + ] + enable_grot = np.abs(global_random_rot_range[0] - + global_random_rot_range[1]) >= 1e-3 + + if not isinstance(center_noise_std, (list, tuple, np.ndarray)): + center_noise_std = [ + center_noise_std, center_noise_std, center_noise_std + ] + if valid_mask is None: + valid_mask = np.ones((num_boxes, ), dtype=np.bool_) + center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) + + loc_noises = np.random.normal( + scale=center_noise_std, size=[num_boxes, num_try, 3]) + rot_noises = np.random.uniform( + rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try]) + gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) + grot_lowers = global_random_rot_range[0] - gt_grots + grot_uppers = global_random_rot_range[1] - gt_grots + global_rot_noises = np.random.uniform( + grot_lowers[..., np.newaxis], + grot_uppers[..., np.newaxis], + size=[num_boxes, num_try]) + + origin = (0.5, 0.5, 0) + gt_box_corners = box_np_ops.center_to_corner_box3d( + gt_boxes[:, :3], + gt_boxes[:, 3:6], + gt_boxes[:, 6], + origin=origin, + axis=2) + + # TODO: rewrite this noise box function? 
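# A minimal sketch (illustrative shapes and values only) of driving the
# per-object augmentation implemented by noise_per_object_v3_ above:
# gt_boxes is an (N, 7) array of (x, y, z, dx, dy, dz, yaw), points is
# (M, 4), and both are modified in place.
import numpy as np

from mmdet3d.datasets.pipelines import data_augment_utils

gt_boxes = np.array([[10.0, 2.0, -1.0, 4.0, 1.8, 1.6, 0.1],
                     [20.0, -3.0, -1.0, 3.9, 1.7, 1.5, 1.2]],
                    dtype=np.float32)
points = (np.random.rand(2000, 4).astype(np.float32) - 0.5) * 60.0

data_augment_utils.noise_per_object_v3_(
    gt_boxes,
    points=points,
    rotation_perturb=np.pi / 12,     # tighter than the pi / 4 default
    center_noise_std=[0.25, 0.25, 0.25],
    global_random_rot_range=0.0,     # |range| < 1e-3 disables global rotation
    num_try=50)
# gt_boxes and the points that fall inside them are now jittered in place.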
+ if not enable_grot: + selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, loc_noises, rot_noises) + else: + selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, loc_noises, rot_noises, + global_rot_noises) + + loc_transforms = _select_transform(loc_noises, selected_noise) + rot_transforms = _select_transform(rot_noises, selected_noise) + surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) + if points is not None: + # TODO: replace this points_in_convex function by my tools? + point_masks = box_np_ops.points_in_convex_polygon_3d_jit( + points[:, :3], surfaces) + points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms, + rot_transforms, valid_mask) + + box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) diff --git a/mmdet3d/datasets/pipelines/dbsampler.py b/mmdet3d/datasets/pipelines/dbsampler.py index ef82c88..69c92d7 100644 --- a/mmdet3d/datasets/pipelines/dbsampler.py +++ b/mmdet3d/datasets/pipelines/dbsampler.py @@ -1,340 +1,340 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import os -import warnings - -import mmcv -import numpy as np - -from mmdet3d.core.bbox import box_np_ops -from mmdet3d.datasets.pipelines import data_augment_utils -from ..builder import OBJECTSAMPLERS, PIPELINES - - -class BatchSampler: - """Class for sampling specific category of ground truths. - - Args: - sample_list (list[dict]): List of samples. - name (str, optional): The category of samples. Default: None. - epoch (int, optional): Sampling epoch. Default: None. - shuffle (bool, optional): Whether to shuffle indices. Default: False. - drop_reminder (bool, optional): Drop reminder. Default: False. - """ - - def __init__(self, - sampled_list, - name=None, - epoch=None, - shuffle=True, - drop_reminder=False): - self._sampled_list = sampled_list - self._indices = np.arange(len(sampled_list)) - if shuffle: - np.random.shuffle(self._indices) - self._idx = 0 - self._example_num = len(sampled_list) - self._name = name - self._shuffle = shuffle - self._epoch = epoch - self._epoch_counter = 0 - self._drop_reminder = drop_reminder - - def _sample(self, num): - """Sample specific number of ground truths and return indices. - - Args: - num (int): Sampled number. - - Returns: - list[int]: Indices of sampled ground truths. - """ - if self._idx + num >= self._example_num: - ret = self._indices[self._idx:].copy() - self._reset() - else: - ret = self._indices[self._idx:self._idx + num] - self._idx += num - return ret - - def _reset(self): - """Reset the index of batchsampler to zero.""" - assert self._name is not None - # print("reset", self._name) - if self._shuffle: - np.random.shuffle(self._indices) - self._idx = 0 - - def sample(self, num): - """Sample specific number of ground truths. - - Args: - num (int): Sampled number. - - Returns: - list[dict]: Sampled ground truths. - """ - indices = self._sample(num) - return [self._sampled_list[i] for i in indices] - - -@OBJECTSAMPLERS.register_module() -class DataBaseSampler(object): - """Class for sampling data from the ground truth database. - - Args: - info_path (str): Path of groundtruth database info. - data_root (str): Path of groundtruth database. - rate (float): Rate of actual sampled over maximum sampled number. - prepare (dict): Name of preparation functions and the input value. - sample_groups (dict): Sampled classes and numbers. - classes (list[str], optional): List of classes. Default: None. - points_loader(dict, optional): Config of points loader. 
Default: - dict(type='LoadPointsFromFile', load_dim=4, use_dim=[0,1,2,3]) - """ - - def __init__(self, - info_path, - data_root, - rate, - prepare, - sample_groups, - classes=None, - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=4, - use_dim=[0, 1, 2, 3]), - file_client_args=dict(backend='disk')): - super().__init__() - self.data_root = data_root - self.info_path = info_path - self.rate = rate - self.prepare = prepare - self.classes = classes - self.cat2label = {name: i for i, name in enumerate(classes)} - self.label2cat = {i: name for i, name in enumerate(classes)} - self.points_loader = mmcv.build_from_cfg(points_loader, PIPELINES) - self.file_client = mmcv.FileClient(**file_client_args) - - # load data base infos - if hasattr(self.file_client, 'get_local_path'): - with self.file_client.get_local_path(info_path) as local_path: - # loading data from a file-like object needs file format - db_infos = mmcv.load(open(local_path, 'rb'), file_format='pkl') - else: - warnings.warn( - 'The used MMCV version does not have get_local_path. ' - f'We treat the {info_path} as local paths and it ' - 'might cause errors if the path is not a local path. ' - 'Please use MMCV>= 1.3.16 if you meet errors.') - db_infos = mmcv.load(info_path) - - # filter database infos - from mmdet3d.utils import get_root_logger - logger = get_root_logger() - for k, v in db_infos.items(): - logger.info(f'load {len(v)} {k} database infos') - for prep_func, val in prepare.items(): - db_infos = getattr(self, prep_func)(db_infos, val) - logger.info('After filter database:') - for k, v in db_infos.items(): - logger.info(f'load {len(v)} {k} database infos') - - self.db_infos = db_infos - - # load sample groups - # TODO: more elegant way to load sample groups - self.sample_groups = [] - for name, num in sample_groups.items(): - self.sample_groups.append({name: int(num)}) - - self.group_db_infos = self.db_infos # just use db_infos - self.sample_classes = [] - self.sample_max_nums = [] - for group_info in self.sample_groups: - self.sample_classes += list(group_info.keys()) - self.sample_max_nums += list(group_info.values()) - - self.sampler_dict = {} - for k, v in self.group_db_infos.items(): - self.sampler_dict[k] = BatchSampler(v, k, shuffle=True) - # TODO: No group_sampling currently - - @staticmethod - def filter_by_difficulty(db_infos, removed_difficulty): - """Filter ground truths by difficulties. - - Args: - db_infos (dict): Info of groundtruth database. - removed_difficulty (list): Difficulties that are not qualified. - - Returns: - dict: Info of database after filtering. - """ - new_db_infos = {} - for key, dinfos in db_infos.items(): - new_db_infos[key] = [ - info for info in dinfos - if info['difficulty'] not in removed_difficulty - ] - return new_db_infos - - @staticmethod - def filter_by_min_points(db_infos, min_gt_points_dict): - """Filter ground truths by number of points in the bbox. - - Args: - db_infos (dict): Info of groundtruth database. - min_gt_points_dict (dict): Different number of minimum points - needed for different categories of ground truths. - - Returns: - dict: Info of database after filtering. 
- """ - for name, min_num in min_gt_points_dict.items(): - min_num = int(min_num) - if min_num > 0: - filtered_infos = [] - for info in db_infos[name]: - if info['num_points_in_gt'] >= min_num: - filtered_infos.append(info) - db_infos[name] = filtered_infos - return db_infos - - def sample_all(self, gt_bboxes, gt_labels, img=None, ground_plane=None): - """Sampling all categories of bboxes. - - Args: - gt_bboxes (np.ndarray): Ground truth bounding boxes. - gt_labels (np.ndarray): Ground truth labels of boxes. - - Returns: - dict: Dict of sampled 'pseudo ground truths'. - - - gt_labels_3d (np.ndarray): ground truths labels - of sampled objects. - - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): - sampled ground truth 3D bounding boxes - - points (np.ndarray): sampled points - - group_ids (np.ndarray): ids of sampled ground truths - """ - sampled_num_dict = {} - sample_num_per_class = [] - for class_name, max_sample_num in zip(self.sample_classes, - self.sample_max_nums): - class_label = self.cat2label[class_name] - # sampled_num = int(max_sample_num - - # np.sum([n == class_name for n in gt_names])) - sampled_num = int(max_sample_num - - np.sum([n == class_label for n in gt_labels])) - sampled_num = np.round(self.rate * sampled_num).astype(np.int64) - sampled_num_dict[class_name] = sampled_num - sample_num_per_class.append(sampled_num) - - sampled = [] - sampled_gt_bboxes = [] - avoid_coll_boxes = gt_bboxes - - for class_name, sampled_num in zip(self.sample_classes, - sample_num_per_class): - if sampled_num > 0: - sampled_cls = self.sample_class_v2(class_name, sampled_num, - avoid_coll_boxes) - - sampled += sampled_cls - if len(sampled_cls) > 0: - if len(sampled_cls) == 1: - sampled_gt_box = sampled_cls[0]['box3d_lidar'][ - np.newaxis, ...] - else: - sampled_gt_box = np.stack( - [s['box3d_lidar'] for s in sampled_cls], axis=0) - - sampled_gt_bboxes += [sampled_gt_box] - avoid_coll_boxes = np.concatenate( - [avoid_coll_boxes, sampled_gt_box], axis=0) - - ret = None - if len(sampled) > 0: - sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0) - # center = sampled_gt_bboxes[:, 0:3] - - # num_sampled = len(sampled) - s_points_list = [] - count = 0 - for info in sampled: - file_path = os.path.join( - self.data_root, - info['path']) if self.data_root else info['path'] - results = dict(pts_filename=file_path) - s_points = self.points_loader(results)['points'] - s_points.translate(info['box3d_lidar'][:3]) - - count += 1 - - s_points_list.append(s_points) - - gt_labels = np.array([self.cat2label[s['name']] for s in sampled], - dtype=np.long) - - if ground_plane is not None: - xyz = sampled_gt_bboxes[:, :3] - dz = (ground_plane[:3][None, :] * - xyz).sum(-1) + ground_plane[3] - sampled_gt_bboxes[:, 2] -= dz - for i, s_points in enumerate(s_points_list): - s_points.tensor[:, 2].sub_(dz[i]) - - ret = { - 'gt_labels_3d': - gt_labels, - 'gt_bboxes_3d': - sampled_gt_bboxes, - 'points': - s_points_list[0].cat(s_points_list), - 'group_ids': - np.arange(gt_bboxes.shape[0], - gt_bboxes.shape[0] + len(sampled)) - } - - return ret - - def sample_class_v2(self, name, num, gt_bboxes): - """Sampling specific categories of bounding boxes. - - Args: - name (str): Class of objects to be sampled. - num (int): Number of sampled bboxes. - gt_bboxes (np.ndarray): Ground truth boxes. - - Returns: - list[dict]: Valid samples after collision test. 
- """ - sampled = self.sampler_dict[name].sample(num) - sampled = copy.deepcopy(sampled) - num_gt = gt_bboxes.shape[0] - num_sampled = len(sampled) - gt_bboxes_bv = box_np_ops.center_to_corner_box2d( - gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6]) - - sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) - boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy() - - sp_boxes_new = boxes[gt_bboxes.shape[0]:] - sp_boxes_bv = box_np_ops.center_to_corner_box2d( - sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6]) - - total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) - coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) - diag = np.arange(total_bv.shape[0]) - coll_mat[diag, diag] = False - - valid_samples = [] - for i in range(num_gt, num_gt + num_sampled): - if coll_mat[i].any(): - coll_mat[i] = False - coll_mat[:, i] = False - else: - valid_samples.append(sampled[i - num_gt]) - return valid_samples +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import os +import warnings + +import mmcv +import numpy as np + +from mmdet3d.core.bbox import box_np_ops +from mmdet3d.datasets.pipelines import data_augment_utils +from ..builder import OBJECTSAMPLERS, PIPELINES + + +class BatchSampler: + """Class for sampling specific category of ground truths. + + Args: + sample_list (list[dict]): List of samples. + name (str, optional): The category of samples. Default: None. + epoch (int, optional): Sampling epoch. Default: None. + shuffle (bool, optional): Whether to shuffle indices. Default: False. + drop_reminder (bool, optional): Drop reminder. Default: False. + """ + + def __init__(self, + sampled_list, + name=None, + epoch=None, + shuffle=True, + drop_reminder=False): + self._sampled_list = sampled_list + self._indices = np.arange(len(sampled_list)) + if shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + self._example_num = len(sampled_list) + self._name = name + self._shuffle = shuffle + self._epoch = epoch + self._epoch_counter = 0 + self._drop_reminder = drop_reminder + + def _sample(self, num): + """Sample specific number of ground truths and return indices. + + Args: + num (int): Sampled number. + + Returns: + list[int]: Indices of sampled ground truths. + """ + if self._idx + num >= self._example_num: + ret = self._indices[self._idx:].copy() + self._reset() + else: + ret = self._indices[self._idx:self._idx + num] + self._idx += num + return ret + + def _reset(self): + """Reset the index of batchsampler to zero.""" + assert self._name is not None + # print("reset", self._name) + if self._shuffle: + np.random.shuffle(self._indices) + self._idx = 0 + + def sample(self, num): + """Sample specific number of ground truths. + + Args: + num (int): Sampled number. + + Returns: + list[dict]: Sampled ground truths. + """ + indices = self._sample(num) + return [self._sampled_list[i] for i in indices] + + +@OBJECTSAMPLERS.register_module() +class DataBaseSampler(object): + """Class for sampling data from the ground truth database. + + Args: + info_path (str): Path of groundtruth database info. + data_root (str): Path of groundtruth database. + rate (float): Rate of actual sampled over maximum sampled number. + prepare (dict): Name of preparation functions and the input value. + sample_groups (dict): Sampled classes and numbers. + classes (list[str], optional): List of classes. Default: None. + points_loader(dict, optional): Config of points loader. 
Default: + dict(type='LoadPointsFromFile', load_dim=4, use_dim=[0,1,2,3]) + """ + + def __init__(self, + info_path, + data_root, + rate, + prepare, + sample_groups, + classes=None, + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=[0, 1, 2, 3]), + file_client_args=dict(backend='disk')): + super().__init__() + self.data_root = data_root + self.info_path = info_path + self.rate = rate + self.prepare = prepare + self.classes = classes + self.cat2label = {name: i for i, name in enumerate(classes)} + self.label2cat = {i: name for i, name in enumerate(classes)} + self.points_loader = mmcv.build_from_cfg(points_loader, PIPELINES) + self.file_client = mmcv.FileClient(**file_client_args) + + # load data base infos + if hasattr(self.file_client, 'get_local_path'): + with self.file_client.get_local_path(info_path) as local_path: + # loading data from a file-like object needs file format + db_infos = mmcv.load(open(local_path, 'rb'), file_format='pkl') + else: + warnings.warn( + 'The used MMCV version does not have get_local_path. ' + f'We treat the {info_path} as local paths and it ' + 'might cause errors if the path is not a local path. ' + 'Please use MMCV>= 1.3.16 if you meet errors.') + db_infos = mmcv.load(info_path) + + # filter database infos + from mmdet3d.utils import get_root_logger + logger = get_root_logger() + for k, v in db_infos.items(): + logger.info(f'load {len(v)} {k} database infos') + for prep_func, val in prepare.items(): + db_infos = getattr(self, prep_func)(db_infos, val) + logger.info('After filter database:') + for k, v in db_infos.items(): + logger.info(f'load {len(v)} {k} database infos') + + self.db_infos = db_infos + + # load sample groups + # TODO: more elegant way to load sample groups + self.sample_groups = [] + for name, num in sample_groups.items(): + self.sample_groups.append({name: int(num)}) + + self.group_db_infos = self.db_infos # just use db_infos + self.sample_classes = [] + self.sample_max_nums = [] + for group_info in self.sample_groups: + self.sample_classes += list(group_info.keys()) + self.sample_max_nums += list(group_info.values()) + + self.sampler_dict = {} + for k, v in self.group_db_infos.items(): + self.sampler_dict[k] = BatchSampler(v, k, shuffle=True) + # TODO: No group_sampling currently + + @staticmethod + def filter_by_difficulty(db_infos, removed_difficulty): + """Filter ground truths by difficulties. + + Args: + db_infos (dict): Info of groundtruth database. + removed_difficulty (list): Difficulties that are not qualified. + + Returns: + dict: Info of database after filtering. + """ + new_db_infos = {} + for key, dinfos in db_infos.items(): + new_db_infos[key] = [ + info for info in dinfos + if info['difficulty'] not in removed_difficulty + ] + return new_db_infos + + @staticmethod + def filter_by_min_points(db_infos, min_gt_points_dict): + """Filter ground truths by number of points in the bbox. + + Args: + db_infos (dict): Info of groundtruth database. + min_gt_points_dict (dict): Different number of minimum points + needed for different categories of ground truths. + + Returns: + dict: Info of database after filtering. 
+ """ + for name, min_num in min_gt_points_dict.items(): + min_num = int(min_num) + if min_num > 0: + filtered_infos = [] + for info in db_infos[name]: + if info['num_points_in_gt'] >= min_num: + filtered_infos.append(info) + db_infos[name] = filtered_infos + return db_infos + + def sample_all(self, gt_bboxes, gt_labels, img=None, ground_plane=None): + """Sampling all categories of bboxes. + + Args: + gt_bboxes (np.ndarray): Ground truth bounding boxes. + gt_labels (np.ndarray): Ground truth labels of boxes. + + Returns: + dict: Dict of sampled 'pseudo ground truths'. + + - gt_labels_3d (np.ndarray): ground truths labels + of sampled objects. + - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): + sampled ground truth 3D bounding boxes + - points (np.ndarray): sampled points + - group_ids (np.ndarray): ids of sampled ground truths + """ + sampled_num_dict = {} + sample_num_per_class = [] + for class_name, max_sample_num in zip(self.sample_classes, + self.sample_max_nums): + class_label = self.cat2label[class_name] + # sampled_num = int(max_sample_num - + # np.sum([n == class_name for n in gt_names])) + sampled_num = int(max_sample_num - + np.sum([n == class_label for n in gt_labels])) + sampled_num = np.round(self.rate * sampled_num).astype(np.int64) + sampled_num_dict[class_name] = sampled_num + sample_num_per_class.append(sampled_num) + + sampled = [] + sampled_gt_bboxes = [] + avoid_coll_boxes = gt_bboxes + + for class_name, sampled_num in zip(self.sample_classes, + sample_num_per_class): + if sampled_num > 0: + sampled_cls = self.sample_class_v2(class_name, sampled_num, + avoid_coll_boxes) + + sampled += sampled_cls + if len(sampled_cls) > 0: + if len(sampled_cls) == 1: + sampled_gt_box = sampled_cls[0]['box3d_lidar'][ + np.newaxis, ...] + else: + sampled_gt_box = np.stack( + [s['box3d_lidar'] for s in sampled_cls], axis=0) + + sampled_gt_bboxes += [sampled_gt_box] + avoid_coll_boxes = np.concatenate( + [avoid_coll_boxes, sampled_gt_box], axis=0) + + ret = None + if len(sampled) > 0: + sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0) + # center = sampled_gt_bboxes[:, 0:3] + + # num_sampled = len(sampled) + s_points_list = [] + count = 0 + for info in sampled: + file_path = os.path.join( + self.data_root, + info['path']) if self.data_root else info['path'] + results = dict(pts_filename=file_path) + s_points = self.points_loader(results)['points'] + s_points.translate(info['box3d_lidar'][:3]) + + count += 1 + + s_points_list.append(s_points) + + gt_labels = np.array([self.cat2label[s['name']] for s in sampled], + dtype=np.long) + + if ground_plane is not None: + xyz = sampled_gt_bboxes[:, :3] + dz = (ground_plane[:3][None, :] * + xyz).sum(-1) + ground_plane[3] + sampled_gt_bboxes[:, 2] -= dz + for i, s_points in enumerate(s_points_list): + s_points.tensor[:, 2].sub_(dz[i]) + + ret = { + 'gt_labels_3d': + gt_labels, + 'gt_bboxes_3d': + sampled_gt_bboxes, + 'points': + s_points_list[0].cat(s_points_list), + 'group_ids': + np.arange(gt_bboxes.shape[0], + gt_bboxes.shape[0] + len(sampled)) + } + + return ret + + def sample_class_v2(self, name, num, gt_bboxes): + """Sampling specific categories of bounding boxes. + + Args: + name (str): Class of objects to be sampled. + num (int): Number of sampled bboxes. + gt_bboxes (np.ndarray): Ground truth boxes. + + Returns: + list[dict]: Valid samples after collision test. 
+ """ + sampled = self.sampler_dict[name].sample(num) + sampled = copy.deepcopy(sampled) + num_gt = gt_bboxes.shape[0] + num_sampled = len(sampled) + gt_bboxes_bv = box_np_ops.center_to_corner_box2d( + gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6]) + + sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) + boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy() + + sp_boxes_new = boxes[gt_bboxes.shape[0]:] + sp_boxes_bv = box_np_ops.center_to_corner_box2d( + sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6]) + + total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) + coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) + diag = np.arange(total_bv.shape[0]) + coll_mat[diag, diag] = False + + valid_samples = [] + for i in range(num_gt, num_gt + num_sampled): + if coll_mat[i].any(): + coll_mat[i] = False + coll_mat[:, i] = False + else: + valid_samples.append(sampled[i - num_gt]) + return valid_samples diff --git a/mmdet3d/datasets/pipelines/formating.py b/mmdet3d/datasets/pipelines/formating.py index 94a62e6..c57b528 100644 --- a/mmdet3d/datasets/pipelines/formating.py +++ b/mmdet3d/datasets/pipelines/formating.py @@ -1,266 +1,266 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -from mmcv.parallel import DataContainer as DC - -from mmdet3d.core.bbox import BaseInstance3DBoxes -from mmdet3d.core.points import BasePoints -from mmdet.datasets.pipelines import to_tensor -from ..builder import PIPELINES - - -@PIPELINES.register_module() -class DefaultFormatBundle(object): - """Default formatting bundle. - - It simplifies the pipeline of formatting common fields, including "img", - "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". - These fields are formatted as follows. - - - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - - proposals: (1)to tensor, (2)to DataContainer - - gt_bboxes: (1)to tensor, (2)to DataContainer - - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - - gt_labels: (1)to tensor, (2)to DataContainer - - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) - - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, - (3)to DataContainer (stack=True) - """ - - def __init__(self, ): - return - - def __call__(self, results): - """Call function to transform and format common fields in results. - - Args: - results (dict): Result dict contains the data to convert. - - Returns: - dict: The result dict contains the data that is formatted with - default bundle. 
- """ - if 'img' in results: - if isinstance(results['img'], list): - # process multiple imgs in single frame - imgs = [img.transpose(2, 0, 1) for img in results['img']] - imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) - results['img'] = DC(to_tensor(imgs), stack=True) - else: - img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) - results['img'] = DC(to_tensor(img), stack=True) - for key in [ - 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', - 'gt_labels_3d', 'attr_labels', 'pts_instance_mask', - 'pts_semantic_mask', 'centers2d', 'depths' - ]: - if key not in results: - continue - if isinstance(results[key], list): - results[key] = DC([to_tensor(res) for res in results[key]]) - else: - results[key] = DC(to_tensor(results[key])) - if 'gt_bboxes_3d' in results: - if isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes): - results['gt_bboxes_3d'] = DC( - results['gt_bboxes_3d'], cpu_only=True) - else: - results['gt_bboxes_3d'] = DC( - to_tensor(results['gt_bboxes_3d'])) - - if 'gt_masks' in results: - results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) - if 'gt_semantic_seg' in results: - results['gt_semantic_seg'] = DC( - to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) - - return results - - def __repr__(self): - return self.__class__.__name__ - - -@PIPELINES.register_module() -class Collect3D(object): - """Collect data from the loader relevant to the specific task. - - This is usually the last stage of the data loader pipeline. Typically keys - is set to some subset of "img", "proposals", "gt_bboxes", - "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". - - The "img_meta" item is always populated. The contents of the "img_meta" - dictionary depends on "meta_keys". By default this includes: - - - 'img_shape': shape of the image input to the network as a tuple - (h, w, c). Note that images may be zero padded on the - bottom/right if the batch tensor is larger than this shape. - - 'scale_factor': a float indicating the preprocessing scale - - 'flip': a boolean indicating if image flip transform was used - - 'filename': path to the image file - - 'ori_shape': original shape of the image as a tuple (h, w, c) - - 'pad_shape': image shape after padding - - 'lidar2img': transform from lidar to image - - 'depth2img': transform from depth to image - - 'cam2img': transform from camera to image - - 'pcd_horizontal_flip': a boolean indicating if point cloud is - flipped horizontally - - 'pcd_vertical_flip': a boolean indicating if point cloud is - flipped vertically - - 'box_mode_3d': 3D box mode - - 'box_type_3d': 3D box type - - 'img_norm_cfg': a dict of normalization information: - - mean: per channel mean subtraction - - std: per channel std divisor - - to_rgb: bool indicating if bgr was converted to rgb - - 'pcd_trans': point cloud transformations - - 'sample_idx': sample index - - 'pcd_scale_factor': point cloud scale factor - - 'pcd_rotation': rotation applied to point cloud - - 'pts_filename': path to point cloud file. - - Args: - keys (Sequence[str]): Keys of results to be collected in ``data``. - meta_keys (Sequence[str], optional): Meta keys to be converted to - ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
- Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', - 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', - 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', - 'box_type_3d', 'img_norm_cfg', 'pcd_trans', - 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') - """ - - def __init__( - self, - keys, - meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', - 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', - 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', - 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', - 'pcd_scale_factor', 'pcd_rotation', 'pcd_rotation_angle', - 'pts_filename', 'transformation_3d_flow', 'trans_mat', - 'affine_aug')): - self.keys = keys - self.meta_keys = meta_keys - - def __call__(self, results): - """Call function to collect keys in results. The keys in ``meta_keys`` - will be converted to :obj:`mmcv.DataContainer`. - - Args: - results (dict): Result dict contains the data to collect. - - Returns: - dict: The result dict contains the following keys - - keys in ``self.keys`` - - ``img_metas`` - """ - data = {} - img_metas = {} - for key in self.meta_keys: - if key in results: - img_metas[key] = results[key] - - data['img_metas'] = DC(img_metas, cpu_only=True) - for key in self.keys: - data[key] = results[key] - return data - - def __repr__(self): - """str: Return a string that describes the module.""" - return self.__class__.__name__ + \ - f'(keys={self.keys}, meta_keys={self.meta_keys})' - - -@PIPELINES.register_module() -class DefaultFormatBundle3D(DefaultFormatBundle): - """Default formatting bundle. - - It simplifies the pipeline of formatting common fields for voxels, - including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and - "gt_semantic_seg". - These fields are formatted as follows. - - - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - - proposals: (1)to tensor, (2)to DataContainer - - gt_bboxes: (1)to tensor, (2)to DataContainer - - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - - gt_labels: (1)to tensor, (2)to DataContainer - """ - - def __init__(self, class_names, with_gt=True, with_label=True): - super(DefaultFormatBundle3D, self).__init__() - self.class_names = class_names - self.with_gt = with_gt - self.with_label = with_label - - def __call__(self, results): - """Call function to transform and format common fields in results. - - Args: - results (dict): Result dict contains the data to convert. - - Returns: - dict: The result dict contains the data that is formatted with - default bundle. 
- """ - # Format 3D data - if 'points' in results: - assert isinstance(results['points'], BasePoints) - results['points'] = DC(results['points'].tensor) - - for key in ['voxels', 'coors', 'voxel_centers', 'num_points']: - if key not in results: - continue - results[key] = DC(to_tensor(results[key]), stack=False) - - if self.with_gt: - # Clean GT bboxes in the final - if 'gt_bboxes_3d_mask' in results: - gt_bboxes_3d_mask = results['gt_bboxes_3d_mask'] - results['gt_bboxes_3d'] = results['gt_bboxes_3d'][ - gt_bboxes_3d_mask] - if 'gt_names_3d' in results: - results['gt_names_3d'] = results['gt_names_3d'][ - gt_bboxes_3d_mask] - if 'centers2d' in results: - results['centers2d'] = results['centers2d'][ - gt_bboxes_3d_mask] - if 'depths' in results: - results['depths'] = results['depths'][gt_bboxes_3d_mask] - if 'gt_bboxes_mask' in results: - gt_bboxes_mask = results['gt_bboxes_mask'] - if 'gt_bboxes' in results: - results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask] - results['gt_names'] = results['gt_names'][gt_bboxes_mask] - if self.with_label: - if 'gt_names' in results and len(results['gt_names']) == 0: - results['gt_labels'] = np.array([], dtype=np.int64) - results['attr_labels'] = np.array([], dtype=np.int64) - elif 'gt_names' in results and isinstance( - results['gt_names'][0], list): - # gt_labels might be a list of list in multi-view setting - results['gt_labels'] = [ - np.array([self.class_names.index(n) for n in res], - dtype=np.int64) for res in results['gt_names'] - ] - elif 'gt_names' in results: - results['gt_labels'] = np.array([ - self.class_names.index(n) for n in results['gt_names'] - ], - dtype=np.int64) - # we still assume one pipeline for one frame LiDAR - # thus, the 3D name is list[string] - if 'gt_names_3d' in results: - results['gt_labels_3d'] = np.array([ - self.class_names.index(n) - for n in results['gt_names_3d'] - ], - dtype=np.int64) - results = super(DefaultFormatBundle3D, self).__call__(results) - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(class_names={self.class_names}, ' - repr_str += f'with_gt={self.with_gt}, with_label={self.with_label})' - return repr_str +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.parallel import DataContainer as DC + +from mmdet3d.core.bbox import BaseInstance3DBoxes +from mmdet3d.core.points import BasePoints +from mmdet.datasets.pipelines import to_tensor +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class DefaultFormatBundle(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, + (3)to DataContainer (stack=True) + """ + + def __init__(self, ): + return + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. 
+ + Returns: + dict: The result dict contains the data that is formatted with + default bundle. + """ + if 'img' in results: + if isinstance(results['img'], list): + # process multiple imgs in single frame + imgs = [img.transpose(2, 0, 1) for img in results['img']] + imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) + results['img'] = DC(to_tensor(imgs), stack=True) + else: + img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', + 'gt_labels_3d', 'attr_labels', 'pts_instance_mask', + 'pts_semantic_mask', 'centers2d', 'depths' + ]: + if key not in results: + continue + if isinstance(results[key], list): + results[key] = DC([to_tensor(res) for res in results[key]]) + else: + results[key] = DC(to_tensor(results[key])) + if 'gt_bboxes_3d' in results: + if isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes): + results['gt_bboxes_3d'] = DC( + results['gt_bboxes_3d'], cpu_only=True) + else: + results['gt_bboxes_3d'] = DC( + to_tensor(results['gt_bboxes_3d'])) + + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class Collect3D(object): + """Collect data from the loader relevant to the specific task. + + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + + - 'img_shape': shape of the image input to the network as a tuple + (h, w, c). Note that images may be zero padded on the + bottom/right if the batch tensor is larger than this shape. + - 'scale_factor': a float indicating the preprocessing scale + - 'flip': a boolean indicating if image flip transform was used + - 'filename': path to the image file + - 'ori_shape': original shape of the image as a tuple (h, w, c) + - 'pad_shape': image shape after padding + - 'lidar2img': transform from lidar to image + - 'depth2img': transform from depth to image + - 'cam2img': transform from camera to image + - 'pcd_horizontal_flip': a boolean indicating if point cloud is + flipped horizontally + - 'pcd_vertical_flip': a boolean indicating if point cloud is + flipped vertically + - 'box_mode_3d': 3D box mode + - 'box_type_3d': 3D box type + - 'img_norm_cfg': a dict of normalization information: + - mean: per channel mean subtraction + - std: per channel std divisor + - to_rgb: bool indicating if bgr was converted to rgb + - 'pcd_trans': point cloud transformations + - 'sample_idx': sample index + - 'pcd_scale_factor': point cloud scale factor + - 'pcd_rotation': rotation applied to point cloud + - 'pts_filename': path to point cloud file. + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
+ Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') + """ + + def __init__( + self, + keys, + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pcd_rotation_angle', + 'pts_filename', 'transformation_3d_flow', 'trans_mat', + 'affine_aug')): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:`mmcv.DataContainer`. + + Args: + results (dict): Result dict contains the data to collect. + + Returns: + dict: The result dict contains the following keys + - keys in ``self.keys`` + - ``img_metas`` + """ + data = {} + img_metas = {} + for key in self.meta_keys: + if key in results: + img_metas[key] = results[key] + + data['img_metas'] = DC(img_metas, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + """str: Return a string that describes the module.""" + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + +@PIPELINES.register_module() +class DefaultFormatBundle3D(DefaultFormatBundle): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + + def __init__(self, class_names, with_gt=True, with_label=True): + super(DefaultFormatBundle3D, self).__init__() + self.class_names = class_names + self.with_gt = with_gt + self.with_label = with_label + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with + default bundle. 
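# A minimal sketch of how the two formatting transforms above usually sit at
# the end of a 3D detection training pipeline (class names and collected keys
# are illustrative): DefaultFormatBundle3D wraps points, boxes and labels
# into DataContainer objects, and Collect3D gathers the requested keys plus
# `img_metas`.
class_names = ['Car']  # placeholder class list

train_pipeline_tail = [
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]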
+ """ + # Format 3D data + if 'points' in results: + assert isinstance(results['points'], BasePoints) + results['points'] = DC(results['points'].tensor) + + for key in ['voxels', 'coors', 'voxel_centers', 'num_points']: + if key not in results: + continue + results[key] = DC(to_tensor(results[key]), stack=False) + + if self.with_gt: + # Clean GT bboxes in the final + if 'gt_bboxes_3d_mask' in results: + gt_bboxes_3d_mask = results['gt_bboxes_3d_mask'] + results['gt_bboxes_3d'] = results['gt_bboxes_3d'][ + gt_bboxes_3d_mask] + if 'gt_names_3d' in results: + results['gt_names_3d'] = results['gt_names_3d'][ + gt_bboxes_3d_mask] + if 'centers2d' in results: + results['centers2d'] = results['centers2d'][ + gt_bboxes_3d_mask] + if 'depths' in results: + results['depths'] = results['depths'][gt_bboxes_3d_mask] + if 'gt_bboxes_mask' in results: + gt_bboxes_mask = results['gt_bboxes_mask'] + if 'gt_bboxes' in results: + results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask] + results['gt_names'] = results['gt_names'][gt_bboxes_mask] + if self.with_label: + if 'gt_names' in results and len(results['gt_names']) == 0: + results['gt_labels'] = np.array([], dtype=np.int64) + results['attr_labels'] = np.array([], dtype=np.int64) + elif 'gt_names' in results and isinstance( + results['gt_names'][0], list): + # gt_labels might be a list of list in multi-view setting + results['gt_labels'] = [ + np.array([self.class_names.index(n) for n in res], + dtype=np.int64) for res in results['gt_names'] + ] + elif 'gt_names' in results: + results['gt_labels'] = np.array([ + self.class_names.index(n) for n in results['gt_names'] + ], + dtype=np.int64) + # we still assume one pipeline for one frame LiDAR + # thus, the 3D name is list[string] + if 'gt_names_3d' in results: + results['gt_labels_3d'] = np.array([ + self.class_names.index(n) + for n in results['gt_names_3d'] + ], + dtype=np.int64) + results = super(DefaultFormatBundle3D, self).__call__(results) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(class_names={self.class_names}, ' + repr_str += f'with_gt={self.with_gt}, with_label={self.with_label})' + return repr_str diff --git a/mmdet3d/datasets/pipelines/loading.py b/mmdet3d/datasets/pipelines/loading.py index bbdcb8e..3ad10a9 100644 --- a/mmdet3d/datasets/pipelines/loading.py +++ b/mmdet3d/datasets/pipelines/loading.py @@ -1,685 +1,685 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import mmcv -import numpy as np - -from mmdet3d.core.points import BasePoints, get_points_type -from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile -from ..builder import PIPELINES - - -@PIPELINES.register_module() -class LoadMultiViewImageFromFiles(object): - """Load multi channel images from a list of separate channel files. - - Expects results['img_filename'] to be a list of filenames. - - Args: - to_float32 (bool, optional): Whether to convert the img to float32. - Defaults to False. - color_type (str, optional): Color type of the file. - Defaults to 'unchanged'. - """ - - def __init__(self, to_float32=False, color_type='unchanged'): - self.to_float32 = to_float32 - self.color_type = color_type - - def __call__(self, results): - """Call function to load multi-view image from files. - - Args: - results (dict): Result dict containing multi-view image filenames. - - Returns: - dict: The result dict containing the multi-view image data. - Added keys and values are described below. 
- - - filename (str): Multi-view image filenames. - - img (np.ndarray): Multi-view image arrays. - - img_shape (tuple[int]): Shape of multi-view image arrays. - - ori_shape (tuple[int]): Shape of original image arrays. - - pad_shape (tuple[int]): Shape of padded image arrays. - - scale_factor (float): Scale factor. - - img_norm_cfg (dict): Normalization configuration of images. - """ - filename = results['img_filename'] - # img is of shape (h, w, c, num_views) - img = np.stack( - [mmcv.imread(name, self.color_type) for name in filename], axis=-1) - if self.to_float32: - img = img.astype(np.float32) - results['filename'] = filename - # unravel to list, see `DefaultFormatBundle` in formatting.py - # which will transpose each image separately and then stack into array - results['img'] = [img[..., i] for i in range(img.shape[-1])] - results['img_shape'] = img.shape - results['ori_shape'] = img.shape - # Set initial values for default meta_keys - results['pad_shape'] = img.shape - results['scale_factor'] = 1.0 - num_channels = 1 if len(img.shape) < 3 else img.shape[2] - results['img_norm_cfg'] = dict( - mean=np.zeros(num_channels, dtype=np.float32), - std=np.ones(num_channels, dtype=np.float32), - to_rgb=False) - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(to_float32={self.to_float32}, ' - repr_str += f"color_type='{self.color_type}')" - return repr_str - - -@PIPELINES.register_module() -class LoadImageFromFileMono3D(LoadImageFromFile): - """Load an image from file in monocular 3D object detection. Compared to 2D - detection, additional camera parameters need to be loaded. - - Args: - kwargs (dict): Arguments are the same as those in - :class:`LoadImageFromFile`. - """ - - def __call__(self, results): - """Call functions to load image and get image meta information. - - Args: - results (dict): Result dict from :obj:`mmdet.CustomDataset`. - - Returns: - dict: The dict contains loaded image and meta information. - """ - super().__call__(results) - results['cam2img'] = results['img_info']['cam_intrinsic'] - return results - - -@PIPELINES.register_module() -class LoadPointsFromMultiSweeps(object): - """Load points from multiple sweeps. - - This is usually used for nuScenes dataset to utilize previous sweeps. - - Args: - sweeps_num (int, optional): Number of sweeps. Defaults to 10. - load_dim (int, optional): Dimension number of the loaded points. - Defaults to 5. - use_dim (list[int], optional): Which dimension to use. - Defaults to [0, 1, 2, 4]. - file_client_args (dict, optional): Config dict of file clients, - refer to - https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py - for more details. Defaults to dict(backend='disk'). - pad_empty_sweeps (bool, optional): Whether to repeat keyframe when - sweeps is empty. Defaults to False. - remove_close (bool, optional): Whether to remove close points. - Defaults to False. - test_mode (bool, optional): If `test_mode=True`, it will not - randomly sample sweeps but select the nearest N frames. - Defaults to False. 
- """ - - def __init__(self, - sweeps_num=10, - load_dim=5, - use_dim=[0, 1, 2, 4], - file_client_args=dict(backend='disk'), - pad_empty_sweeps=False, - remove_close=False, - test_mode=False): - self.load_dim = load_dim - self.sweeps_num = sweeps_num - self.use_dim = use_dim - self.file_client_args = file_client_args.copy() - self.file_client = None - self.pad_empty_sweeps = pad_empty_sweeps - self.remove_close = remove_close - self.test_mode = test_mode - - def _load_points(self, pts_filename): - """Private function to load point clouds data. - - Args: - pts_filename (str): Filename of point clouds data. - - Returns: - np.ndarray: An array containing point clouds data. - """ - if self.file_client is None: - self.file_client = mmcv.FileClient(**self.file_client_args) - try: - pts_bytes = self.file_client.get(pts_filename) - points = np.frombuffer(pts_bytes, dtype=np.float32) - except ConnectionError: - mmcv.check_file_exist(pts_filename) - if pts_filename.endswith('.npy'): - points = np.load(pts_filename) - else: - points = np.fromfile(pts_filename, dtype=np.float32) - return points - - def _remove_close(self, points, radius=1.0): - """Removes point too close within a certain radius from origin. - - Args: - points (np.ndarray | :obj:`BasePoints`): Sweep points. - radius (float, optional): Radius below which points are removed. - Defaults to 1.0. - - Returns: - np.ndarray: Points after removing. - """ - if isinstance(points, np.ndarray): - points_numpy = points - elif isinstance(points, BasePoints): - points_numpy = points.tensor.numpy() - else: - raise NotImplementedError - x_filt = np.abs(points_numpy[:, 0]) < radius - y_filt = np.abs(points_numpy[:, 1]) < radius - not_close = np.logical_not(np.logical_and(x_filt, y_filt)) - return points[not_close] - - def __call__(self, results): - """Call function to load multi-sweep point clouds from files. - - Args: - results (dict): Result dict containing multi-sweep point cloud - filenames. - - Returns: - dict: The result dict containing the multi-sweep points data. - Added key and value are described below. - - - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point - cloud arrays. 
- """ - points = results['points'] - points.tensor[:, 4] = 0 - sweep_points_list = [points] - ts = results['timestamp'] - if self.pad_empty_sweeps and len(results['sweeps']) == 0: - for i in range(self.sweeps_num): - if self.remove_close: - sweep_points_list.append(self._remove_close(points)) - else: - sweep_points_list.append(points) - else: - if len(results['sweeps']) <= self.sweeps_num: - choices = np.arange(len(results['sweeps'])) - elif self.test_mode: - choices = np.arange(self.sweeps_num) - else: - choices = np.random.choice( - len(results['sweeps']), self.sweeps_num, replace=False) - for idx in choices: - sweep = results['sweeps'][idx] - points_sweep = self._load_points(sweep['data_path']) - points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim) - if self.remove_close: - points_sweep = self._remove_close(points_sweep) - sweep_ts = sweep['timestamp'] / 1e6 - points_sweep[:, :3] = points_sweep[:, :3] @ sweep[ - 'sensor2lidar_rotation'].T - points_sweep[:, :3] += sweep['sensor2lidar_translation'] - points_sweep[:, 4] = ts - sweep_ts - points_sweep = points.new_point(points_sweep) - sweep_points_list.append(points_sweep) - - points = points.cat(sweep_points_list) - points = points[:, self.use_dim] - results['points'] = points - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})' - - -@PIPELINES.register_module() -class PointSegClassMapping(object): - """Map original semantic class to valid category ids. - - Map valid classes as 0~len(valid_cat_ids)-1 and - others as len(valid_cat_ids). - - Args: - valid_cat_ids (tuple[int]): A tuple of valid category. - max_cat_id (int, optional): The max possible cat_id in input - segmentation mask. Defaults to 40. - """ - - def __init__(self, valid_cat_ids, max_cat_id=40): - assert max_cat_id >= np.max(valid_cat_ids), \ - 'max_cat_id should be greater than maximum id in valid_cat_ids' - - self.valid_cat_ids = valid_cat_ids - self.max_cat_id = int(max_cat_id) - - # build cat_id to class index mapping - neg_cls = len(valid_cat_ids) - self.cat_id2class = np.ones( - self.max_cat_id + 1, dtype=np.int) * neg_cls - for cls_idx, cat_id in enumerate(valid_cat_ids): - self.cat_id2class[cat_id] = cls_idx - - def __call__(self, results): - """Call function to map original semantic class to valid category ids. - - Args: - results (dict): Result dict containing point semantic masks. - - Returns: - dict: The result dict containing the mapped category ids. - Updated key and value are described below. - - - pts_semantic_mask (np.ndarray): Mapped semantic masks. - """ - assert 'pts_semantic_mask' in results - pts_semantic_mask = results['pts_semantic_mask'] - - converted_pts_sem_mask = self.cat_id2class[pts_semantic_mask] - - results['pts_semantic_mask'] = converted_pts_sem_mask - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(valid_cat_ids={self.valid_cat_ids}, ' - repr_str += f'max_cat_id={self.max_cat_id})' - return repr_str - - -@PIPELINES.register_module() -class NormalizePointsColor(object): - """Normalize color of points. - - Args: - color_mean (list[float]): Mean color of the point cloud. - """ - - def __init__(self, color_mean): - self.color_mean = color_mean - - def __call__(self, results): - """Call function to normalize color of points. - - Args: - results (dict): Result dict containing point clouds data. 
- - Returns: - dict: The result dict containing the normalized points. - Updated key and value are described below. - - - points (:obj:`BasePoints`): Points after color normalization. - """ - points = results['points'] - assert points.attribute_dims is not None and \ - 'color' in points.attribute_dims.keys(), \ - 'Expect points have color attribute' - if self.color_mean is not None: - points.color = points.color - \ - points.color.new_tensor(self.color_mean) - points.color = points.color / 255.0 - results['points'] = points - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(color_mean={self.color_mean})' - return repr_str - - -@PIPELINES.register_module() -class LoadPointsFromFile(object): - """Load Points From File. - - Load points from file. - - Args: - coord_type (str): The type of coordinates of points cloud. - Available options includes: - - 'LIDAR': Points in LiDAR coordinates. - - 'DEPTH': Points in depth coordinates, usually for indoor dataset. - - 'CAMERA': Points in camera coordinates. - load_dim (int, optional): The dimension of the loaded points. - Defaults to 6. - use_dim (list[int], optional): Which dimensions of the points to use. - Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 - or use_dim=[0, 1, 2, 3] to use the intensity dimension. - shift_height (bool, optional): Whether to use shifted height. - Defaults to False. - use_color (bool, optional): Whether to use color features. - Defaults to False. - file_client_args (dict, optional): Config dict of file clients, - refer to - https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py - for more details. Defaults to dict(backend='disk'). - """ - - def __init__(self, - coord_type, - load_dim=6, - use_dim=[0, 1, 2], - shift_height=False, - use_color=False, - file_client_args=dict(backend='disk')): - self.shift_height = shift_height - self.use_color = use_color - if isinstance(use_dim, int): - use_dim = list(range(use_dim)) - assert max(use_dim) < load_dim, \ - f'Expect all used dimensions < {load_dim}, got {use_dim}' - assert coord_type in ['CAMERA', 'LIDAR', 'DEPTH'] - - self.coord_type = coord_type - self.load_dim = load_dim - self.use_dim = use_dim - self.file_client_args = file_client_args.copy() - self.file_client = None - - def _load_points(self, pts_filename): - """Private function to load point clouds data. - - Args: - pts_filename (str): Filename of point clouds data. - - Returns: - np.ndarray: An array containing point clouds data. - """ - if self.file_client is None: - self.file_client = mmcv.FileClient(**self.file_client_args) - try: - pts_bytes = self.file_client.get(pts_filename) - points = np.frombuffer(pts_bytes, dtype=np.float32) - except ConnectionError: - mmcv.check_file_exist(pts_filename) - if pts_filename.endswith('.npy'): - points = np.load(pts_filename) - else: - points = np.fromfile(pts_filename, dtype=np.float32) - - return points - - def __call__(self, results): - """Call function to load points data from file. - - Args: - results (dict): Result dict containing point clouds data. - - Returns: - dict: The result dict containing the point clouds data. - Added key and value are described below. - - - points (:obj:`BasePoints`): Point clouds data. 
- """ - pts_filename = results['pts_filename'] - points = self._load_points(pts_filename) - points = points.reshape(-1, self.load_dim) - points = points[:, self.use_dim] - attribute_dims = None - - if self.shift_height: - floor_height = np.percentile(points[:, 2], 0.99) - height = points[:, 2] - floor_height - points = np.concatenate( - [points[:, :3], - np.expand_dims(height, 1), points[:, 3:]], 1) - attribute_dims = dict(height=3) - - if self.use_color: - assert len(self.use_dim) >= 6 - if attribute_dims is None: - attribute_dims = dict() - attribute_dims.update( - dict(color=[ - points.shape[1] - 3, - points.shape[1] - 2, - points.shape[1] - 1, - ])) - - points_class = get_points_type(self.coord_type) - points = points_class( - points, points_dim=points.shape[-1], attribute_dims=attribute_dims) - results['points'] = points - - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ + '(' - repr_str += f'shift_height={self.shift_height}, ' - repr_str += f'use_color={self.use_color}, ' - repr_str += f'file_client_args={self.file_client_args}, ' - repr_str += f'load_dim={self.load_dim}, ' - repr_str += f'use_dim={self.use_dim})' - return repr_str - - -@PIPELINES.register_module() -class LoadPointsFromDict(LoadPointsFromFile): - """Load Points From Dict.""" - - def __call__(self, results): - assert 'points' in results - return results - - -@PIPELINES.register_module() -class LoadAnnotations3D(LoadAnnotations): - """Load Annotations3D. - - Load instance mask and semantic mask of points and - encapsulate the items into related fields. - - Args: - with_bbox_3d (bool, optional): Whether to load 3D boxes. - Defaults to True. - with_label_3d (bool, optional): Whether to load 3D labels. - Defaults to True. - with_attr_label (bool, optional): Whether to load attribute label. - Defaults to False. - with_mask_3d (bool, optional): Whether to load 3D instance masks. - for points. Defaults to False. - with_seg_3d (bool, optional): Whether to load 3D semantic masks. - for points. Defaults to False. - with_bbox (bool, optional): Whether to load 2D boxes. - Defaults to False. - with_label (bool, optional): Whether to load 2D labels. - Defaults to False. - with_mask (bool, optional): Whether to load 2D instance masks. - Defaults to False. - with_seg (bool, optional): Whether to load 2D semantic masks. - Defaults to False. - with_bbox_depth (bool, optional): Whether to load 2.5D boxes. - Defaults to False. - poly2mask (bool, optional): Whether to convert polygon annotations - to bitmasks. Defaults to True. - seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks. - Defaults to int64 - file_client_args (dict): Config dict of file clients, refer to - https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py - for more details. 
- """ - - def __init__(self, - with_bbox_3d=True, - with_label_3d=True, - with_attr_label=False, - with_mask_3d=False, - with_seg_3d=False, - with_bbox=False, - with_label=False, - with_mask=False, - with_seg=False, - with_bbox_depth=False, - poly2mask=True, - seg_3d_dtype=np.int64, - file_client_args=dict(backend='disk')): - super().__init__( - with_bbox, - with_label, - with_mask, - with_seg, - poly2mask, - file_client_args=file_client_args) - self.with_bbox_3d = with_bbox_3d - self.with_bbox_depth = with_bbox_depth - self.with_label_3d = with_label_3d - self.with_attr_label = with_attr_label - self.with_mask_3d = with_mask_3d - self.with_seg_3d = with_seg_3d - self.seg_3d_dtype = seg_3d_dtype - - def _load_bboxes_3d(self, results): - """Private function to load 3D bounding box annotations. - - Args: - results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. - - Returns: - dict: The dict containing loaded 3D bounding box annotations. - """ - results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d'] - results['bbox3d_fields'].append('gt_bboxes_3d') - return results - - def _load_bboxes_depth(self, results): - """Private function to load 2.5D bounding box annotations. - - Args: - results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. - - Returns: - dict: The dict containing loaded 2.5D bounding box annotations. - """ - results['centers2d'] = results['ann_info']['centers2d'] - results['depths'] = results['ann_info']['depths'] - return results - - def _load_labels_3d(self, results): - """Private function to load label annotations. - - Args: - results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. - - Returns: - dict: The dict containing loaded label annotations. - """ - results['gt_labels_3d'] = results['ann_info']['gt_labels_3d'] - return results - - def _load_attr_labels(self, results): - """Private function to load label annotations. - - Args: - results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. - - Returns: - dict: The dict containing loaded label annotations. - """ - results['attr_labels'] = results['ann_info']['attr_labels'] - return results - - def _load_masks_3d(self, results): - """Private function to load 3D mask annotations. - - Args: - results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. - - Returns: - dict: The dict containing loaded 3D mask annotations. - """ - pts_instance_mask_path = results['ann_info']['pts_instance_mask_path'] - - if self.file_client is None: - self.file_client = mmcv.FileClient(**self.file_client_args) - try: - mask_bytes = self.file_client.get(pts_instance_mask_path) - pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int64) - except ConnectionError: - mmcv.check_file_exist(pts_instance_mask_path) - pts_instance_mask = np.fromfile( - pts_instance_mask_path, dtype=np.int64) - - results['pts_instance_mask'] = pts_instance_mask - results['pts_mask_fields'].append('pts_instance_mask') - return results - - def _load_semantic_seg_3d(self, results): - """Private function to load 3D semantic segmentation annotations. - - Args: - results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. - - Returns: - dict: The dict containing the semantic segmentation annotations. 
- """ - pts_semantic_mask_path = results['ann_info']['pts_semantic_mask_path'] - - if self.file_client is None: - self.file_client = mmcv.FileClient(**self.file_client_args) - try: - mask_bytes = self.file_client.get(pts_semantic_mask_path) - # add .copy() to fix read-only bug - pts_semantic_mask = np.frombuffer( - mask_bytes, dtype=self.seg_3d_dtype).copy() - except ConnectionError: - mmcv.check_file_exist(pts_semantic_mask_path) - pts_semantic_mask = np.fromfile( - pts_semantic_mask_path, dtype=np.int64) - - results['pts_semantic_mask'] = pts_semantic_mask - results['pts_seg_fields'].append('pts_semantic_mask') - return results - - def __call__(self, results): - """Call function to load multiple types annotations. - - Args: - results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. - - Returns: - dict: The dict containing loaded 3D bounding box, label, mask and - semantic segmentation annotations. - """ - results = super().__call__(results) - if self.with_bbox_3d: - results = self._load_bboxes_3d(results) - if results is None: - return None - if self.with_bbox_depth: - results = self._load_bboxes_depth(results) - if results is None: - return None - if self.with_label_3d: - results = self._load_labels_3d(results) - if self.with_attr_label: - results = self._load_attr_labels(results) - if self.with_mask_3d: - results = self._load_masks_3d(results) - if self.with_seg_3d: - results = self._load_semantic_seg_3d(results) - - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - indent_str = ' ' - repr_str = self.__class__.__name__ + '(\n' - repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, ' - repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, ' - repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, ' - repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, ' - repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, ' - repr_str += f'{indent_str}with_bbox={self.with_bbox}, ' - repr_str += f'{indent_str}with_label={self.with_label}, ' - repr_str += f'{indent_str}with_mask={self.with_mask}, ' - repr_str += f'{indent_str}with_seg={self.with_seg}, ' - repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, ' - repr_str += f'{indent_str}poly2mask={self.poly2mask})' - return repr_str +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np + +from mmdet3d.core.points import BasePoints, get_points_type +from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class LoadMultiViewImageFromFiles(object): + """Load multi channel images from a list of separate channel files. + + Expects results['img_filename'] to be a list of filenames. + + Args: + to_float32 (bool, optional): Whether to convert the img to float32. + Defaults to False. + color_type (str, optional): Color type of the file. + Defaults to 'unchanged'. + """ + + def __init__(self, to_float32=False, color_type='unchanged'): + self.to_float32 = to_float32 + self.color_type = color_type + + def __call__(self, results): + """Call function to load multi-view image from files. + + Args: + results (dict): Result dict containing multi-view image filenames. + + Returns: + dict: The result dict containing the multi-view image data. + Added keys and values are described below. + + - filename (str): Multi-view image filenames. + - img (np.ndarray): Multi-view image arrays. + - img_shape (tuple[int]): Shape of multi-view image arrays. 
+ - ori_shape (tuple[int]): Shape of original image arrays. + - pad_shape (tuple[int]): Shape of padded image arrays. + - scale_factor (float): Scale factor. + - img_norm_cfg (dict): Normalization configuration of images. + """ + filename = results['img_filename'] + # img is of shape (h, w, c, num_views) + img = np.stack( + [mmcv.imread(name, self.color_type) for name in filename], axis=-1) + if self.to_float32: + img = img.astype(np.float32) + results['filename'] = filename + # unravel to list, see `DefaultFormatBundle` in formatting.py + # which will transpose each image separately and then stack into array + results['img'] = [img[..., i] for i in range(img.shape[-1])] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results['img_norm_cfg'] = dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(to_float32={self.to_float32}, ' + repr_str += f"color_type='{self.color_type}')" + return repr_str + + +@PIPELINES.register_module() +class LoadImageFromFileMono3D(LoadImageFromFile): + """Load an image from file in monocular 3D object detection. Compared to 2D + detection, additional camera parameters need to be loaded. + + Args: + kwargs (dict): Arguments are the same as those in + :class:`LoadImageFromFile`. + """ + + def __call__(self, results): + """Call functions to load image and get image meta information. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded image and meta information. + """ + super().__call__(results) + results['cam2img'] = results['img_info']['cam_intrinsic'] + return results + + +@PIPELINES.register_module() +class LoadPointsFromMultiSweeps(object): + """Load points from multiple sweeps. + + This is usually used for nuScenes dataset to utilize previous sweeps. + + Args: + sweeps_num (int, optional): Number of sweeps. Defaults to 10. + load_dim (int, optional): Dimension number of the loaded points. + Defaults to 5. + use_dim (list[int], optional): Which dimension to use. + Defaults to [0, 1, 2, 4]. + file_client_args (dict, optional): Config dict of file clients, + refer to + https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py + for more details. Defaults to dict(backend='disk'). + pad_empty_sweeps (bool, optional): Whether to repeat keyframe when + sweeps is empty. Defaults to False. + remove_close (bool, optional): Whether to remove close points. + Defaults to False. + test_mode (bool, optional): If `test_mode=True`, it will not + randomly sample sweeps but select the nearest N frames. + Defaults to False. + """ + + def __init__(self, + sweeps_num=10, + load_dim=5, + use_dim=[0, 1, 2, 4], + file_client_args=dict(backend='disk'), + pad_empty_sweeps=False, + remove_close=False, + test_mode=False): + self.load_dim = load_dim + self.sweeps_num = sweeps_num + self.use_dim = use_dim + self.file_client_args = file_client_args.copy() + self.file_client = None + self.pad_empty_sweeps = pad_empty_sweeps + self.remove_close = remove_close + self.test_mode = test_mode + + def _load_points(self, pts_filename): + """Private function to load point clouds data. 
+ + Args: + pts_filename (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. + """ + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + try: + pts_bytes = self.file_client.get(pts_filename) + points = np.frombuffer(pts_bytes, dtype=np.float32) + except ConnectionError: + mmcv.check_file_exist(pts_filename) + if pts_filename.endswith('.npy'): + points = np.load(pts_filename) + else: + points = np.fromfile(pts_filename, dtype=np.float32) + return points + + def _remove_close(self, points, radius=1.0): + """Removes point too close within a certain radius from origin. + + Args: + points (np.ndarray | :obj:`BasePoints`): Sweep points. + radius (float, optional): Radius below which points are removed. + Defaults to 1.0. + + Returns: + np.ndarray: Points after removing. + """ + if isinstance(points, np.ndarray): + points_numpy = points + elif isinstance(points, BasePoints): + points_numpy = points.tensor.numpy() + else: + raise NotImplementedError + x_filt = np.abs(points_numpy[:, 0]) < radius + y_filt = np.abs(points_numpy[:, 1]) < radius + not_close = np.logical_not(np.logical_and(x_filt, y_filt)) + return points[not_close] + + def __call__(self, results): + """Call function to load multi-sweep point clouds from files. + + Args: + results (dict): Result dict containing multi-sweep point cloud + filenames. + + Returns: + dict: The result dict containing the multi-sweep points data. + Added key and value are described below. + + - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point + cloud arrays. + """ + points = results['points'] + points.tensor[:, 4] = 0 + sweep_points_list = [points] + ts = results['timestamp'] + if self.pad_empty_sweeps and len(results['sweeps']) == 0: + for i in range(self.sweeps_num): + if self.remove_close: + sweep_points_list.append(self._remove_close(points)) + else: + sweep_points_list.append(points) + else: + if len(results['sweeps']) <= self.sweeps_num: + choices = np.arange(len(results['sweeps'])) + elif self.test_mode: + choices = np.arange(self.sweeps_num) + else: + choices = np.random.choice( + len(results['sweeps']), self.sweeps_num, replace=False) + for idx in choices: + sweep = results['sweeps'][idx] + points_sweep = self._load_points(sweep['data_path']) + points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim) + if self.remove_close: + points_sweep = self._remove_close(points_sweep) + sweep_ts = sweep['timestamp'] / 1e6 + points_sweep[:, :3] = points_sweep[:, :3] @ sweep[ + 'sensor2lidar_rotation'].T + points_sweep[:, :3] += sweep['sensor2lidar_translation'] + points_sweep[:, 4] = ts - sweep_ts + points_sweep = points.new_point(points_sweep) + sweep_points_list.append(points_sweep) + + points = points.cat(sweep_points_list) + points = points[:, self.use_dim] + results['points'] = points + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})' + + +@PIPELINES.register_module() +class PointSegClassMapping(object): + """Map original semantic class to valid category ids. + + Map valid classes as 0~len(valid_cat_ids)-1 and + others as len(valid_cat_ids). + + Args: + valid_cat_ids (tuple[int]): A tuple of valid category. + max_cat_id (int, optional): The max possible cat_id in input + segmentation mask. Defaults to 40. 
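# Illustrative usage sketch: a nuScenes-style configuration of
# LoadPointsFromMultiSweeps; the concrete values are assumptions for
# illustration. Each sweep is rotated/translated into the keyframe LiDAR frame
# and its 5th channel (index 4) is overwritten with the time lag to the
# keyframe (ts - sweep_ts), while the keyframe itself gets 0 there.
load_sweeps = dict(
    type='LoadPointsFromMultiSweeps',
    sweeps_num=10,
    load_dim=5,
    use_dim=[0, 1, 2, 4],
    pad_empty_sweeps=True,
    remove_close=True)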
+ """ + + def __init__(self, valid_cat_ids, max_cat_id=40): + assert max_cat_id >= np.max(valid_cat_ids), \ + 'max_cat_id should be greater than maximum id in valid_cat_ids' + + self.valid_cat_ids = valid_cat_ids + self.max_cat_id = int(max_cat_id) + + # build cat_id to class index mapping + neg_cls = len(valid_cat_ids) + self.cat_id2class = np.ones( + self.max_cat_id + 1, dtype=np.int) * neg_cls + for cls_idx, cat_id in enumerate(valid_cat_ids): + self.cat_id2class[cat_id] = cls_idx + + def __call__(self, results): + """Call function to map original semantic class to valid category ids. + + Args: + results (dict): Result dict containing point semantic masks. + + Returns: + dict: The result dict containing the mapped category ids. + Updated key and value are described below. + + - pts_semantic_mask (np.ndarray): Mapped semantic masks. + """ + assert 'pts_semantic_mask' in results + pts_semantic_mask = results['pts_semantic_mask'] + + converted_pts_sem_mask = self.cat_id2class[pts_semantic_mask] + + results['pts_semantic_mask'] = converted_pts_sem_mask + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(valid_cat_ids={self.valid_cat_ids}, ' + repr_str += f'max_cat_id={self.max_cat_id})' + return repr_str + + +@PIPELINES.register_module() +class NormalizePointsColor(object): + """Normalize color of points. + + Args: + color_mean (list[float]): Mean color of the point cloud. + """ + + def __init__(self, color_mean): + self.color_mean = color_mean + + def __call__(self, results): + """Call function to normalize color of points. + + Args: + results (dict): Result dict containing point clouds data. + + Returns: + dict: The result dict containing the normalized points. + Updated key and value are described below. + + - points (:obj:`BasePoints`): Points after color normalization. + """ + points = results['points'] + assert points.attribute_dims is not None and \ + 'color' in points.attribute_dims.keys(), \ + 'Expect points have color attribute' + if self.color_mean is not None: + points.color = points.color - \ + points.color.new_tensor(self.color_mean) + points.color = points.color / 255.0 + results['points'] = points + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(color_mean={self.color_mean})' + return repr_str + + +@PIPELINES.register_module() +class LoadPointsFromFile(object): + """Load Points From File. + + Load points from file. + + Args: + coord_type (str): The type of coordinates of points cloud. + Available options includes: + - 'LIDAR': Points in LiDAR coordinates. + - 'DEPTH': Points in depth coordinates, usually for indoor dataset. + - 'CAMERA': Points in camera coordinates. + load_dim (int, optional): The dimension of the loaded points. + Defaults to 6. + use_dim (list[int], optional): Which dimensions of the points to use. + Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 + or use_dim=[0, 1, 2, 3] to use the intensity dimension. + shift_height (bool, optional): Whether to use shifted height. + Defaults to False. + use_color (bool, optional): Whether to use color features. + Defaults to False. + file_client_args (dict, optional): Config dict of file clients, + refer to + https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py + for more details. Defaults to dict(backend='disk'). 
+ """ + + def __init__(self, + coord_type, + load_dim=6, + use_dim=[0, 1, 2], + shift_height=False, + use_color=False, + file_client_args=dict(backend='disk')): + self.shift_height = shift_height + self.use_color = use_color + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + assert max(use_dim) < load_dim, \ + f'Expect all used dimensions < {load_dim}, got {use_dim}' + assert coord_type in ['CAMERA', 'LIDAR', 'DEPTH'] + + self.coord_type = coord_type + self.load_dim = load_dim + self.use_dim = use_dim + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_points(self, pts_filename): + """Private function to load point clouds data. + + Args: + pts_filename (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. + """ + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + try: + pts_bytes = self.file_client.get(pts_filename) + points = np.frombuffer(pts_bytes, dtype=np.float32) + except ConnectionError: + mmcv.check_file_exist(pts_filename) + if pts_filename.endswith('.npy'): + points = np.load(pts_filename) + else: + points = np.fromfile(pts_filename, dtype=np.float32) + + return points + + def __call__(self, results): + """Call function to load points data from file. + + Args: + results (dict): Result dict containing point clouds data. + + Returns: + dict: The result dict containing the point clouds data. + Added key and value are described below. + + - points (:obj:`BasePoints`): Point clouds data. + """ + pts_filename = results['pts_filename'] + points = self._load_points(pts_filename) + points = points.reshape(-1, self.load_dim) + points = points[:, self.use_dim] + attribute_dims = None + + if self.shift_height: + floor_height = np.percentile(points[:, 2], 0.99) + height = points[:, 2] - floor_height + points = np.concatenate( + [points[:, :3], + np.expand_dims(height, 1), points[:, 3:]], 1) + attribute_dims = dict(height=3) + + if self.use_color: + assert len(self.use_dim) >= 6 + if attribute_dims is None: + attribute_dims = dict() + attribute_dims.update( + dict(color=[ + points.shape[1] - 3, + points.shape[1] - 2, + points.shape[1] - 1, + ])) + + points_class = get_points_type(self.coord_type) + points = points_class( + points, points_dim=points.shape[-1], attribute_dims=attribute_dims) + results['points'] = points + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + '(' + repr_str += f'shift_height={self.shift_height}, ' + repr_str += f'use_color={self.use_color}, ' + repr_str += f'file_client_args={self.file_client_args}, ' + repr_str += f'load_dim={self.load_dim}, ' + repr_str += f'use_dim={self.use_dim})' + return repr_str + + +@PIPELINES.register_module() +class LoadPointsFromDict(LoadPointsFromFile): + """Load Points From Dict.""" + + def __call__(self, results): + assert 'points' in results + return results + + +@PIPELINES.register_module() +class LoadAnnotations3D(LoadAnnotations): + """Load Annotations3D. + + Load instance mask and semantic mask of points and + encapsulate the items into related fields. + + Args: + with_bbox_3d (bool, optional): Whether to load 3D boxes. + Defaults to True. + with_label_3d (bool, optional): Whether to load 3D labels. + Defaults to True. + with_attr_label (bool, optional): Whether to load attribute label. + Defaults to False. + with_mask_3d (bool, optional): Whether to load 3D instance masks. + for points. 
Defaults to False. + with_seg_3d (bool, optional): Whether to load 3D semantic masks. + for points. Defaults to False. + with_bbox (bool, optional): Whether to load 2D boxes. + Defaults to False. + with_label (bool, optional): Whether to load 2D labels. + Defaults to False. + with_mask (bool, optional): Whether to load 2D instance masks. + Defaults to False. + with_seg (bool, optional): Whether to load 2D semantic masks. + Defaults to False. + with_bbox_depth (bool, optional): Whether to load 2.5D boxes. + Defaults to False. + poly2mask (bool, optional): Whether to convert polygon annotations + to bitmasks. Defaults to True. + seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks. + Defaults to int64 + file_client_args (dict): Config dict of file clients, refer to + https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py + for more details. + """ + + def __init__(self, + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False, + with_mask_3d=False, + with_seg_3d=False, + with_bbox=False, + with_label=False, + with_mask=False, + with_seg=False, + with_bbox_depth=False, + poly2mask=True, + seg_3d_dtype=np.int64, + file_client_args=dict(backend='disk')): + super().__init__( + with_bbox, + with_label, + with_mask, + with_seg, + poly2mask, + file_client_args=file_client_args) + self.with_bbox_3d = with_bbox_3d + self.with_bbox_depth = with_bbox_depth + self.with_label_3d = with_label_3d + self.with_attr_label = with_attr_label + self.with_mask_3d = with_mask_3d + self.with_seg_3d = with_seg_3d + self.seg_3d_dtype = seg_3d_dtype + + def _load_bboxes_3d(self, results): + """Private function to load 3D bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 3D bounding box annotations. + """ + results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d'] + results['bbox3d_fields'].append('gt_bboxes_3d') + return results + + def _load_bboxes_depth(self, results): + """Private function to load 2.5D bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 2.5D bounding box annotations. + """ + results['centers2d'] = results['ann_info']['centers2d'] + results['depths'] = results['ann_info']['depths'] + return results + + def _load_labels_3d(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing loaded label annotations. + """ + results['gt_labels_3d'] = results['ann_info']['gt_labels_3d'] + return results + + def _load_attr_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing loaded label annotations. + """ + results['attr_labels'] = results['ann_info']['attr_labels'] + return results + + def _load_masks_3d(self, results): + """Private function to load 3D mask annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 3D mask annotations. 
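# Illustrative usage sketch: a KITTI-style pipeline head combining the two
# loaders documented above. load_dim=4 / use_dim=4 keep x, y, z and intensity;
# all values are assumptions for illustration. If shift_height or use_color
# were enabled, the extra channels would be recorded in attribute_dims.
train_pipeline_head = [
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
]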
+ """ + pts_instance_mask_path = results['ann_info']['pts_instance_mask_path'] + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + try: + mask_bytes = self.file_client.get(pts_instance_mask_path) + pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int64) + except ConnectionError: + mmcv.check_file_exist(pts_instance_mask_path) + pts_instance_mask = np.fromfile( + pts_instance_mask_path, dtype=np.int64) + + results['pts_instance_mask'] = pts_instance_mask + results['pts_mask_fields'].append('pts_instance_mask') + return results + + def _load_semantic_seg_3d(self, results): + """Private function to load 3D semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing the semantic segmentation annotations. + """ + pts_semantic_mask_path = results['ann_info']['pts_semantic_mask_path'] + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + try: + mask_bytes = self.file_client.get(pts_semantic_mask_path) + # add .copy() to fix read-only bug + pts_semantic_mask = np.frombuffer( + mask_bytes, dtype=self.seg_3d_dtype).copy() + except ConnectionError: + mmcv.check_file_exist(pts_semantic_mask_path) + pts_semantic_mask = np.fromfile( + pts_semantic_mask_path, dtype=np.int64) + + results['pts_semantic_mask'] = pts_semantic_mask + results['pts_seg_fields'].append('pts_semantic_mask') + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 3D bounding box, label, mask and + semantic segmentation annotations. + """ + results = super().__call__(results) + if self.with_bbox_3d: + results = self._load_bboxes_3d(results) + if results is None: + return None + if self.with_bbox_depth: + results = self._load_bboxes_depth(results) + if results is None: + return None + if self.with_label_3d: + results = self._load_labels_3d(results) + if self.with_attr_label: + results = self._load_attr_labels(results) + if self.with_mask_3d: + results = self._load_masks_3d(results) + if self.with_seg_3d: + results = self._load_semantic_seg_3d(results) + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, ' + repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, ' + repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, ' + repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, ' + repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, ' + repr_str += f'{indent_str}with_bbox={self.with_bbox}, ' + repr_str += f'{indent_str}with_label={self.with_label}, ' + repr_str += f'{indent_str}with_mask={self.with_mask}, ' + repr_str += f'{indent_str}with_seg={self.with_seg}, ' + repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, ' + repr_str += f'{indent_str}poly2mask={self.poly2mask})' + return repr_str diff --git a/mmdet3d/datasets/pipelines/test_time_aug.py b/mmdet3d/datasets/pipelines/test_time_aug.py index d53f110..3146570 100644 --- a/mmdet3d/datasets/pipelines/test_time_aug.py +++ b/mmdet3d/datasets/pipelines/test_time_aug.py @@ -1,229 +1,229 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import warnings -from copy import deepcopy - -import mmcv - -from ..builder import PIPELINES -from .compose import Compose - - -@PIPELINES.register_module() -class MultiScaleFlipAug: - """Test-time augmentation with multiple scales and flipping. An example - configuration is as followed: - - .. code-block:: - img_scale=[(1333, 400), (1333, 800)], - flip=True, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size_divisor=32), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ] - After MultiScaleFLipAug with above configuration, the results are wrapped - into lists of the same length as followed: - .. code-block:: - dict( - img=[...], - img_shape=[...], - scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] - flip=[False, True, False, True] - ... - ) - Args: - transforms (list[dict]): Transforms to apply in each augmentation. - img_scale (tuple | list[tuple] | None): Images scales for resizing. - scale_factor (float | list[float] | None): Scale factors for resizing. - flip (bool): Whether apply flip augmentation. Default: False. - flip_direction (str | list[str]): Flip augmentation directions, - options are "horizontal", "vertical" and "diagonal". If - flip_direction is a list, multiple flip augmentations will be - applied. It has no effect when flip == False. Default: - "horizontal". - """ - - def __init__(self, - transforms, - img_scale=None, - scale_factor=None, - flip=False, - flip_direction='horizontal'): - self.transforms = Compose(transforms) - assert (img_scale is None) ^ (scale_factor is None), ( - 'Must have but only one variable can be set') - if img_scale is not None: - self.img_scale = img_scale if isinstance(img_scale, - list) else [img_scale] - self.scale_key = 'scale' - assert mmcv.is_list_of(self.img_scale, tuple) - else: - self.img_scale = scale_factor if isinstance( - scale_factor, list) else [scale_factor] - self.scale_key = 'scale_factor' - - self.flip = flip - self.flip_direction = flip_direction if isinstance( - flip_direction, list) else [flip_direction] - assert mmcv.is_list_of(self.flip_direction, str) - if not self.flip and self.flip_direction != ['horizontal']: - warnings.warn( - 'flip_direction has no effect when flip is set to False') - if (self.flip - and not any([t['type'] == 'RandomFlip' for t in transforms])): - warnings.warn( - 'flip has no effect when RandomFlip is not in transforms') - - def __call__(self, results): - """Call function to apply test time augment transforms on results. - - Args: - results (dict): Result dict contains the data to transform. - Returns: - dict[str: list]: The augmented data, where each value is wrapped - into a list. 
- """ - - aug_data = [] - flip_args = [(False, None)] - if self.flip: - flip_args += [(True, direction) - for direction in self.flip_direction] - for scale in self.img_scale: - for flip, direction in flip_args: - _results = results.copy() - _results[self.scale_key] = scale - _results['flip'] = flip - _results['flip_direction'] = direction - data = self.transforms(_results) - aug_data.append(data) - # list of dict to dict of list - aug_data_dict = {key: [] for key in aug_data[0]} - for data in aug_data: - for key, val in data.items(): - aug_data_dict[key].append(val) - return aug_data_dict - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(transforms={self.transforms}, ' - repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' - repr_str += f'flip_direction={self.flip_direction})' - return repr_str - - -@PIPELINES.register_module() -class MultiScaleFlipAug3D(object): - """Test-time augmentation with multiple scales and flipping. - - Args: - transforms (list[dict]): Transforms to apply in each augmentation. - img_scale (tuple | list[tuple]: Images scales for resizing. - pts_scale_ratio (float | list[float]): Points scale ratios for - resizing. - flip (bool, optional): Whether apply flip augmentation. - Defaults to False. - flip_direction (str | list[str], optional): Flip augmentation - directions for images, options are "horizontal" and "vertical". - If flip_direction is list, multiple flip augmentations will - be applied. It has no effect when ``flip == False``. - Defaults to "horizontal". - pcd_horizontal_flip (bool, optional): Whether apply horizontal - flip augmentation to point cloud. Defaults to True. - Note that it works only when 'flip' is turned on. - pcd_vertical_flip (bool, optional): Whether apply vertical flip - augmentation to point cloud. Defaults to True. - Note that it works only when 'flip' is turned on. - """ - - def __init__(self, - transforms, - img_scale, - pts_scale_ratio, - flip=False, - flip_direction='horizontal', - pcd_horizontal_flip=False, - pcd_vertical_flip=False): - self.transforms = Compose(transforms) - self.img_scale = img_scale if isinstance(img_scale, - list) else [img_scale] - self.pts_scale_ratio = pts_scale_ratio \ - if isinstance(pts_scale_ratio, list) else[float(pts_scale_ratio)] - - assert mmcv.is_list_of(self.img_scale, tuple) - assert mmcv.is_list_of(self.pts_scale_ratio, float) - - self.flip = flip - self.pcd_horizontal_flip = pcd_horizontal_flip - self.pcd_vertical_flip = pcd_vertical_flip - - self.flip_direction = flip_direction if isinstance( - flip_direction, list) else [flip_direction] - assert mmcv.is_list_of(self.flip_direction, str) - if not self.flip and self.flip_direction != ['horizontal']: - warnings.warn( - 'flip_direction has no effect when flip is set to False') - if (self.flip and not any([(t['type'] == 'RandomFlip3D' - or t['type'] == 'RandomFlip') - for t in transforms])): - warnings.warn( - 'flip has no effect when RandomFlip is not in transforms') - - def __call__(self, results): - """Call function to augment common fields in results. - - Args: - results (dict): Result dict contains the data to augment. - - Returns: - dict: The result dict contains the data that is augmented with - different scales and flips. 
- """ - aug_data = [] - - # modified from `flip_aug = [False, True] if self.flip else [False]` - # to reduce unnecessary scenes when using double flip augmentation - # during test time - flip_aug = [True] if self.flip else [False] - pcd_horizontal_flip_aug = [False, True] \ - if self.flip and self.pcd_horizontal_flip else [False] - pcd_vertical_flip_aug = [False, True] \ - if self.flip and self.pcd_vertical_flip else [False] - for scale in self.img_scale: - for pts_scale_ratio in self.pts_scale_ratio: - for flip in flip_aug: - for pcd_horizontal_flip in pcd_horizontal_flip_aug: - for pcd_vertical_flip in pcd_vertical_flip_aug: - for direction in self.flip_direction: - # results.copy will cause bug - # since it is shallow copy - _results = deepcopy(results) - _results['scale'] = scale - _results['flip'] = flip - _results['pcd_scale_factor'] = \ - pts_scale_ratio - _results['flip_direction'] = direction - _results['pcd_horizontal_flip'] = \ - pcd_horizontal_flip - _results['pcd_vertical_flip'] = \ - pcd_vertical_flip - data = self.transforms(_results) - aug_data.append(data) - # list of dict to dict of list - aug_data_dict = {key: [] for key in aug_data[0]} - for data in aug_data: - for key, val in data.items(): - aug_data_dict[key].append(val) - return aug_data_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(transforms={self.transforms}, ' - repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' - repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, ' - repr_str += f'flip_direction={self.flip_direction})' - return repr_str +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from copy import deepcopy + +import mmcv + +from ..builder import PIPELINES +from .compose import Compose + + +@PIPELINES.register_module() +class MultiScaleFlipAug: + """Test-time augmentation with multiple scales and flipping. An example + configuration is as followed: + + .. code-block:: + img_scale=[(1333, 400), (1333, 800)], + flip=True, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ] + After MultiScaleFLipAug with above configuration, the results are wrapped + into lists of the same length as followed: + .. code-block:: + dict( + img=[...], + img_shape=[...], + scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] + flip=[False, True, False, True] + ... + ) + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (tuple | list[tuple] | None): Images scales for resizing. + scale_factor (float | list[float] | None): Scale factors for resizing. + flip (bool): Whether apply flip augmentation. Default: False. + flip_direction (str | list[str]): Flip augmentation directions, + options are "horizontal", "vertical" and "diagonal". If + flip_direction is a list, multiple flip augmentations will be + applied. It has no effect when flip == False. Default: + "horizontal". 
+ """ + + def __init__(self, + transforms, + img_scale=None, + scale_factor=None, + flip=False, + flip_direction='horizontal'): + self.transforms = Compose(transforms) + assert (img_scale is None) ^ (scale_factor is None), ( + 'Must have but only one variable can be set') + if img_scale is not None: + self.img_scale = img_scale if isinstance(img_scale, + list) else [img_scale] + self.scale_key = 'scale' + assert mmcv.is_list_of(self.img_scale, tuple) + else: + self.img_scale = scale_factor if isinstance( + scale_factor, list) else [scale_factor] + self.scale_key = 'scale_factor' + + self.flip = flip + self.flip_direction = flip_direction if isinstance( + flip_direction, list) else [flip_direction] + assert mmcv.is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ['horizontal']: + warnings.warn( + 'flip_direction has no effect when flip is set to False') + if (self.flip + and not any([t['type'] == 'RandomFlip' for t in transforms])): + warnings.warn( + 'flip has no effect when RandomFlip is not in transforms') + + def __call__(self, results): + """Call function to apply test time augment transforms on results. + + Args: + results (dict): Result dict contains the data to transform. + Returns: + dict[str: list]: The augmented data, where each value is wrapped + into a list. + """ + + aug_data = [] + flip_args = [(False, None)] + if self.flip: + flip_args += [(True, direction) + for direction in self.flip_direction] + for scale in self.img_scale: + for flip, direction in flip_args: + _results = results.copy() + _results[self.scale_key] = scale + _results['flip'] = flip + _results['flip_direction'] = direction + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(transforms={self.transforms}, ' + repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' + repr_str += f'flip_direction={self.flip_direction})' + return repr_str + + +@PIPELINES.register_module() +class MultiScaleFlipAug3D(object): + """Test-time augmentation with multiple scales and flipping. + + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (tuple | list[tuple]: Images scales for resizing. + pts_scale_ratio (float | list[float]): Points scale ratios for + resizing. + flip (bool, optional): Whether apply flip augmentation. + Defaults to False. + flip_direction (str | list[str], optional): Flip augmentation + directions for images, options are "horizontal" and "vertical". + If flip_direction is list, multiple flip augmentations will + be applied. It has no effect when ``flip == False``. + Defaults to "horizontal". + pcd_horizontal_flip (bool, optional): Whether apply horizontal + flip augmentation to point cloud. Defaults to True. + Note that it works only when 'flip' is turned on. + pcd_vertical_flip (bool, optional): Whether apply vertical flip + augmentation to point cloud. Defaults to True. + Note that it works only when 'flip' is turned on. 
+ """ + + def __init__(self, + transforms, + img_scale, + pts_scale_ratio, + flip=False, + flip_direction='horizontal', + pcd_horizontal_flip=False, + pcd_vertical_flip=False): + self.transforms = Compose(transforms) + self.img_scale = img_scale if isinstance(img_scale, + list) else [img_scale] + self.pts_scale_ratio = pts_scale_ratio \ + if isinstance(pts_scale_ratio, list) else[float(pts_scale_ratio)] + + assert mmcv.is_list_of(self.img_scale, tuple) + assert mmcv.is_list_of(self.pts_scale_ratio, float) + + self.flip = flip + self.pcd_horizontal_flip = pcd_horizontal_flip + self.pcd_vertical_flip = pcd_vertical_flip + + self.flip_direction = flip_direction if isinstance( + flip_direction, list) else [flip_direction] + assert mmcv.is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ['horizontal']: + warnings.warn( + 'flip_direction has no effect when flip is set to False') + if (self.flip and not any([(t['type'] == 'RandomFlip3D' + or t['type'] == 'RandomFlip') + for t in transforms])): + warnings.warn( + 'flip has no effect when RandomFlip is not in transforms') + + def __call__(self, results): + """Call function to augment common fields in results. + + Args: + results (dict): Result dict contains the data to augment. + + Returns: + dict: The result dict contains the data that is augmented with + different scales and flips. + """ + aug_data = [] + + # modified from `flip_aug = [False, True] if self.flip else [False]` + # to reduce unnecessary scenes when using double flip augmentation + # during test time + flip_aug = [True] if self.flip else [False] + pcd_horizontal_flip_aug = [False, True] \ + if self.flip and self.pcd_horizontal_flip else [False] + pcd_vertical_flip_aug = [False, True] \ + if self.flip and self.pcd_vertical_flip else [False] + for scale in self.img_scale: + for pts_scale_ratio in self.pts_scale_ratio: + for flip in flip_aug: + for pcd_horizontal_flip in pcd_horizontal_flip_aug: + for pcd_vertical_flip in pcd_vertical_flip_aug: + for direction in self.flip_direction: + # results.copy will cause bug + # since it is shallow copy + _results = deepcopy(results) + _results['scale'] = scale + _results['flip'] = flip + _results['pcd_scale_factor'] = \ + pts_scale_ratio + _results['flip_direction'] = direction + _results['pcd_horizontal_flip'] = \ + pcd_horizontal_flip + _results['pcd_vertical_flip'] = \ + pcd_vertical_flip + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(transforms={self.transforms}, ' + repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' + repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, ' + repr_str += f'flip_direction={self.flip_direction})' + return repr_str diff --git a/mmdet3d/datasets/pipelines/transforms_3d.py b/mmdet3d/datasets/pipelines/transforms_3d.py index b3f92f0..868df54 100644 --- a/mmdet3d/datasets/pipelines/transforms_3d.py +++ b/mmdet3d/datasets/pipelines/transforms_3d.py @@ -1,2506 +1,2506 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import random -import warnings - -import cv2 -import numpy as np -import torch -from mmcv import is_tuple_of -from mmcv.utils import build_from_cfg - -from mmdet3d.core import VoxelGenerator -from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, - LiDARInstance3DBoxes, box_np_ops) -from mmdet3d.core.bbox.structures import Box3DMode -from mmdet3d.core.points.lidar_points import LiDARPoints -from mmdet3d.datasets.pipelines.compose import Compose -from mmdet.datasets.pipelines import RandomCrop, RandomFlip, Rotate -from ..builder import OBJECTSAMPLERS, PIPELINES -from .data_augment_utils import noise_per_object_v3_ -import open3d as o3d -from data.data_augmentor.augmentation_utils import get_points_in_box -from data.data_denoisor.denoisor_pcp_utils import ResPCPNet -from data.data_denoisor.denoisor_dmr_utils import run_denoise_large_pointcloud, run_denoise_middle_pointcloud, run_denoise - -@PIPELINES.register_module() -class RandomDropPointsColor(object): - r"""Randomly set the color of points to all zeros. - - Once this transform is executed, all the points' color will be dropped. - Refer to `PAConv `_ for more details. - - Args: - drop_ratio (float, optional): The probability of dropping point colors. - Defaults to 0.2. - """ - - def __init__(self, drop_ratio=0.2): - assert isinstance(drop_ratio, (int, float)) and 0 <= drop_ratio <= 1, \ - f'invalid drop_ratio value {drop_ratio}' - self.drop_ratio = drop_ratio - - def __call__(self, input_dict): - """Call function to drop point colors. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after color dropping, - 'points' key is updated in the result dict. - """ - points = input_dict['points'] - assert points.attribute_dims is not None and \ - 'color' in points.attribute_dims, \ - 'Expect points have color attribute' - - # this if-expression is a bit strange - # `RandomDropPointsColor` is used in training 3D segmentor PAConv - # we discovered in our experiments that, using - # `if np.random.rand() > 1.0 - self.drop_ratio` consistently leads to - # better results than using `if np.random.rand() < self.drop_ratio` - # so we keep this hack in our codebase - if np.random.rand() > 1.0 - self.drop_ratio: - points.color = points.color * 0.0 - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(drop_ratio={self.drop_ratio})' - return repr_str - - -@PIPELINES.register_module() -class RandomFlip3D(RandomFlip): - """Flip the points & bbox. - - If the input dict contains the key "flip", then the flag will be used, - otherwise it will be randomly decided by a ratio specified in the init - method. - - Args: - sync_2d (bool, optional): Whether to apply flip according to the 2D - images. If True, it will apply the same flip as that to 2D images. - If False, it will decide whether to flip randomly and independently - to that of 2D images. Defaults to True. - flip_ratio_bev_horizontal (float, optional): The flipping probability - in horizontal direction. Defaults to 0.0. - flip_ratio_bev_vertical (float, optional): The flipping probability - in vertical direction. Defaults to 0.0. 
- """ - - def __init__(self, - sync_2d=True, - flip_ratio_bev_horizontal=0.0, - flip_ratio_bev_vertical=0.0, - **kwargs): - super(RandomFlip3D, self).__init__( - flip_ratio=flip_ratio_bev_horizontal, **kwargs) - self.sync_2d = sync_2d - self.flip_ratio_bev_vertical = flip_ratio_bev_vertical - if flip_ratio_bev_horizontal is not None: - assert isinstance( - flip_ratio_bev_horizontal, - (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1 - if flip_ratio_bev_vertical is not None: - assert isinstance( - flip_ratio_bev_vertical, - (int, float)) and 0 <= flip_ratio_bev_vertical <= 1 - - def random_flip_data_3d(self, input_dict, direction='horizontal'): - """Flip 3D data randomly. - - Args: - input_dict (dict): Result dict from loading pipeline. - direction (str, optional): Flip direction. - Default: 'horizontal'. - - Returns: - dict: Flipped results, 'points', 'bbox3d_fields' keys are - updated in the result dict. - """ - assert direction in ['horizontal', 'vertical'] - # for semantic segmentation task, only points will be flipped. - if 'bbox3d_fields' not in input_dict: - input_dict['points'].flip(direction) - return - if len(input_dict['bbox3d_fields']) == 0: # test mode - input_dict['bbox3d_fields'].append('empty_box3d') - input_dict['empty_box3d'] = input_dict['box_type_3d']( - np.array([], dtype=np.float32)) - assert len(input_dict['bbox3d_fields']) == 1 - for key in input_dict['bbox3d_fields']: - if 'points' in input_dict: - input_dict['points'] = input_dict[key].flip( - direction, points=input_dict['points']) - else: - input_dict[key].flip(direction) - if 'centers2d' in input_dict: - assert self.sync_2d is True and direction == 'horizontal', \ - 'Only support sync_2d=True and horizontal flip with images' - w = input_dict['ori_shape'][1] - input_dict['centers2d'][..., 0] = \ - w - input_dict['centers2d'][..., 0] - # need to modify the horizontal position of camera center - # along u-axis in the image (flip like centers2d) - # ['cam2img'][0][2] = c_u - # see more details and examples at - # https://github.com/open-mmlab/mmdetection3d/pull/744 - input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2] - - def __call__(self, input_dict): - """Call function to flip points, values in the ``bbox3d_fields`` and - also flip 2D image and its annotations. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Flipped results, 'flip', 'flip_direction', - 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added - into result dict. 
- """ - # flip 2D image and its annotations - super(RandomFlip3D, self).__call__(input_dict) - - if self.sync_2d: - input_dict['pcd_horizontal_flip'] = input_dict['flip'] - input_dict['pcd_vertical_flip'] = False - else: - if 'pcd_horizontal_flip' not in input_dict: - flip_horizontal = True if np.random.rand( - ) < self.flip_ratio else False - input_dict['pcd_horizontal_flip'] = flip_horizontal - if 'pcd_vertical_flip' not in input_dict: - flip_vertical = True if np.random.rand( - ) < self.flip_ratio_bev_vertical else False - input_dict['pcd_vertical_flip'] = flip_vertical - - if 'transformation_3d_flow' not in input_dict: - input_dict['transformation_3d_flow'] = [] - - if input_dict['pcd_horizontal_flip']: - self.random_flip_data_3d(input_dict, 'horizontal') - input_dict['transformation_3d_flow'].extend(['HF']) - if input_dict['pcd_vertical_flip']: - self.random_flip_data_3d(input_dict, 'vertical') - input_dict['transformation_3d_flow'].extend(['VF']) - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(sync_2d={self.sync_2d},' - repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})' - return repr_str - - -@PIPELINES.register_module() -class MultiViewWrapper(object): - """Wrap transformation from single-view into multi-view. - - The wrapper processes the images from multi-view one by one. For each - image, it constructs a pseudo dict according to the keys specified by the - 'process_fields' parameter. After the transformation is finished, desired - information can be collected by specifying the keys in the 'collected_keys' - parameter. Multi-view images share the same transformation parameters - but do not share the same magnitude when a random transformation is - conducted. - - Args: - transforms (list[dict]): A list of dict specifying the transformations - for the monocular situation. - process_fields (dict): Desired keys that the transformations should - be conducted on. Default to dict(img_fields=['img']). - collected_keys (list[str]): Collect information in transformation - like rotate angles, crop roi, and flip state. - """ - - def __init__(self, - transforms, - process_fields=dict(img_fields=['img']), - collected_keys=[]): - self.transform = Compose(transforms) - self.collected_keys = collected_keys - self.process_fields = process_fields - - def __call__(self, input_dict): - for key in self.collected_keys: - input_dict[key] = [] - for img_id in range(len(input_dict['img'])): - process_dict = self.process_fields.copy() - for field in self.process_fields: - for key in self.process_fields[field]: - process_dict[key] = input_dict[key][img_id] - process_dict = self.transform(process_dict) - for field in self.process_fields: - for key in self.process_fields[field]: - input_dict[key][img_id] = process_dict[key] - for key in self.collected_keys: - input_dict[key].append(process_dict[key]) - return input_dict - - -@PIPELINES.register_module() -class RangeLimitedRandomCrop(RandomCrop): - """Randomly crop image-view objects under a limitation of range. - - Args: - relative_x_offset_range (tuple[float]): Relative range of random crop - in x direction. (x_min, x_max) in [0, 1.0]. Default to (0.0, 1.0). - relative_y_offset_range (tuple[float]): Relative range of random crop - in y direction. (y_min, y_max) in [0, 1.0]. Default to (0.0, 1.0). 
- """ - - def __init__(self, - relative_x_offset_range=(0.0, 1.0), - relative_y_offset_range=(0.0, 1.0), - **kwargs): - super(RangeLimitedRandomCrop, self).__init__(**kwargs) - for range in [relative_x_offset_range, relative_y_offset_range]: - assert 0 <= range[0] <= range[1] <= 1 - self.relative_x_offset_range = relative_x_offset_range - self.relative_y_offset_range = relative_y_offset_range - - def _crop_data(self, results, crop_size, allow_negative_crop): - """Function to randomly crop images. - - Modified from RandomCrop in mmdet==2.25.0 - - Args: - results (dict): Result dict from loading pipeline. - crop_size (tuple): Expected absolute size after cropping, (h, w). - - Returns: - dict: Randomly cropped results, 'img_shape' key in result dict is - updated according to crop size. - """ - assert crop_size[0] > 0 and crop_size[1] > 0 - for key in results.get('img_fields', ['img']): - img = results[key] - margin_h = max(img.shape[0] - crop_size[0], 0) - margin_w = max(img.shape[1] - crop_size[1], 0) - offset_range_h = (margin_h * self.relative_y_offset_range[0], - margin_h * self.relative_y_offset_range[1] + 1) - offset_h = np.random.randint(*offset_range_h) - offset_range_w = (margin_w * self.relative_x_offset_range[0], - margin_w * self.relative_x_offset_range[1] + 1) - offset_w = np.random.randint(*offset_range_w) - crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] - crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] - - # crop the image - img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] - img_shape = img.shape - results[key] = img - results['crop'] = (crop_x1, crop_y1, crop_x2, crop_y2) - results['img_shape'] = img_shape - - # crop bboxes accordingly and clip to the image boundary - for key in results.get('bbox_fields', []): - # e.g. gt_bboxes and gt_bboxes_ignore - bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], - dtype=np.float32) - bboxes = results[key] - bbox_offset - if self.bbox_clip_border: - bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) - bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) - valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( - bboxes[:, 3] > bboxes[:, 1]) - # If the crop does not contain any gt-bbox area and - # allow_negative_crop is False, skip this image. - if (key == 'gt_bboxes' and not valid_inds.any() - and not allow_negative_crop): - return None - results[key] = bboxes[valid_inds, :] - # label fields. e.g. gt_labels and gt_labels_ignore - label_key = self.bbox2label.get(key) - if label_key in results: - results[label_key] = results[label_key][valid_inds] - - # mask fields, e.g. gt_masks and gt_masks_ignore - mask_key = self.bbox2mask.get(key) - if mask_key in results: - results[mask_key] = results[mask_key][ - valid_inds.nonzero()[0]].crop( - np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) - if self.recompute_bbox: - results[key] = results[mask_key].get_bboxes() - - # crop semantic seg - for key in results.get('seg_fields', []): - results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] - - return results - - -@PIPELINES.register_module() -class RandomRotate(Rotate): - """Randomly rotate images. - - The ratation angle is selected uniformly within the interval specified by - the 'range' parameter. - - Args: - range (tuple[float]): Define the range of random rotation. - (angle_min, angle_max) in angle. 
- """ - - def __init__(self, range, **kwargs): - super(RandomRotate, self).__init__(**kwargs) - self.range = range - - def __call__(self, results): - self.angle = np.random.uniform(self.range[0], self.range[1]) - super(RandomRotate, self).__call__(results) - results['rotate'] = self.angle - return results - - -@PIPELINES.register_module() -class RandomJitterPoints(object): - """Randomly jitter point coordinates. - - Different from the global translation in ``GlobalRotScaleTrans``, here we - apply different noises to each point in a scene. - - Args: - jitter_std (list[float]): The standard deviation of jittering noise. - This applies random noise to all points in a 3D scene, which is - sampled from a gaussian distribution whose standard deviation is - set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01] - clip_range (list[float]): Clip the randomly generated jitter - noise into this range. If None is given, don't perform clipping. - Defaults to [-0.05, 0.05] - - Note: - This transform should only be used in point cloud segmentation tasks - because we don't transform ground-truth bboxes accordingly. - For similar transform in detection task, please refer to `ObjectNoise`. - """ - - def __init__(self, - jitter_std=[0.01, 0.01, 0.01], - clip_range=[-0.05, 0.05]): - seq_types = (list, tuple, np.ndarray) - if not isinstance(jitter_std, seq_types): - assert isinstance(jitter_std, (int, float)), \ - f'unsupported jitter_std type {type(jitter_std)}' - jitter_std = [jitter_std, jitter_std, jitter_std] - self.jitter_std = jitter_std - - if clip_range is not None: - if not isinstance(clip_range, seq_types): - assert isinstance(clip_range, (int, float)), \ - f'unsupported clip_range type {type(clip_range)}' - clip_range = [-clip_range, clip_range] - self.clip_range = clip_range - - def __call__(self, input_dict): - """Call function to jitter all the points in the scene. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after adding noise to each point, - 'points' key is updated in the result dict. - """ - points = input_dict['points'] - jitter_std = np.array(self.jitter_std, dtype=np.float32) - jitter_noise = \ - np.random.randn(points.shape[0], 3) * jitter_std[None, :] - if self.clip_range is not None: - jitter_noise = np.clip(jitter_noise, self.clip_range[0], - self.clip_range[1]) - - points.translate(jitter_noise) - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(jitter_std={self.jitter_std},' - repr_str += f' clip_range={self.clip_range})' - return repr_str - - -@PIPELINES.register_module() -class ObjectSample(object): - """Sample GT objects to the data. - - Args: - db_sampler (dict): Config dict of the database sampler. - sample_2d (bool): Whether to also paste 2D image patch to the images - This should be true when applying multi-modality cut-and-paste. - Defaults to False. - use_ground_plane (bool): Whether to use gound plane to adjust the - 3D labels. - """ - - def __init__(self, db_sampler, sample_2d=False, use_ground_plane=False): - self.sampler_cfg = db_sampler - self.sample_2d = sample_2d - if 'type' not in db_sampler.keys(): - db_sampler['type'] = 'DataBaseSampler' - self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) - self.use_ground_plane = use_ground_plane - - @staticmethod - def remove_points_in_boxes(points, boxes): - """Remove the points in the sampled bounding boxes. 
- - Args: - points (:obj:`BasePoints`): Input point cloud array. - boxes (np.ndarray): Sampled ground truth boxes. - - Returns: - np.ndarray: Points with those in the boxes removed. - """ - masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes) - points = points[np.logical_not(masks.any(-1))] - return points - - def __call__(self, input_dict): - """Call function to sample ground truth objects to the data. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after object sampling augmentation, - 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated - in the result dict. - """ - gt_bboxes_3d = input_dict['gt_bboxes_3d'] - gt_labels_3d = input_dict['gt_labels_3d'] - - if self.use_ground_plane and 'plane' in input_dict['ann_info']: - ground_plane = input_dict['ann_info']['plane'] - input_dict['plane'] = ground_plane - else: - ground_plane = None - # change to float for blending operation - points = input_dict['points'] - if self.sample_2d: - img = input_dict['img'] - gt_bboxes_2d = input_dict['gt_bboxes'] - # Assume for now 3D & 2D bboxes are the same - sampled_dict = self.db_sampler.sample_all( - gt_bboxes_3d.tensor.numpy(), - gt_labels_3d, - gt_bboxes_2d=gt_bboxes_2d, - img=img) - else: - sampled_dict = self.db_sampler.sample_all( - gt_bboxes_3d.tensor.numpy(), - gt_labels_3d, - img=None, - ground_plane=ground_plane) - - if sampled_dict is not None: - sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] - sampled_points = sampled_dict['points'] - sampled_gt_labels = sampled_dict['gt_labels_3d'] - - gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels], - axis=0) - gt_bboxes_3d = gt_bboxes_3d.new_box( - np.concatenate( - [gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d])) - - points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) - # check the points dimension - points = points.cat([sampled_points, points]) - - if self.sample_2d: - sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d'] - gt_bboxes_2d = np.concatenate( - [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) - - input_dict['gt_bboxes'] = gt_bboxes_2d - input_dict['img'] = sampled_dict['img'] - - input_dict['gt_bboxes_3d'] = gt_bboxes_3d - input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.int64) - input_dict['points'] = points - - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f' sample_2d={self.sample_2d},' - repr_str += f' data_root={self.sampler_cfg.data_root},' - repr_str += f' info_path={self.sampler_cfg.info_path},' - repr_str += f' rate={self.sampler_cfg.rate},' - repr_str += f' prepare={self.sampler_cfg.prepare},' - repr_str += f' classes={self.sampler_cfg.classes},' - repr_str += f' sample_groups={self.sampler_cfg.sample_groups}' - return repr_str - - -@PIPELINES.register_module() -class ObjectNoise(object): - """Apply noise to each GT objects in the scene. - - Args: - translation_std (list[float], optional): Standard deviation of the - distribution where translation noise are sampled from. - Defaults to [0.25, 0.25, 0.25]. - global_rot_range (list[float], optional): Global rotation to the scene. - Defaults to [0.0, 0.0]. - rot_range (list[float], optional): Object rotation range. - Defaults to [-0.15707963267, 0.15707963267]. - num_try (int, optional): Number of times to try if the noise applied is - invalid. Defaults to 100. 
- """ - - def __init__(self, - translation_std=[0.25, 0.25, 0.25], - global_rot_range=[0.0, 0.0], - rot_range=[-0.15707963267, 0.15707963267], - num_try=100): - self.translation_std = translation_std - self.global_rot_range = global_rot_range - self.rot_range = rot_range - self.num_try = num_try - - def __call__(self, input_dict): - """Call function to apply noise to each ground truth in the scene. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after adding noise to each object, - 'points', 'gt_bboxes_3d' keys are updated in the result dict. - """ - gt_bboxes_3d = input_dict['gt_bboxes_3d'] - points = input_dict['points'] - - # TODO: this is inplace operation - numpy_box = gt_bboxes_3d.tensor.numpy() - numpy_points = points.tensor.numpy() - - noise_per_object_v3_( - numpy_box, - numpy_points, - rotation_perturb=self.rot_range, - center_noise_std=self.translation_std, - global_random_rot_range=self.global_rot_range, - num_try=self.num_try) - - input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box) - input_dict['points'] = points.new_point(numpy_points) - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(num_try={self.num_try},' - repr_str += f' translation_std={self.translation_std},' - repr_str += f' global_rot_range={self.global_rot_range},' - repr_str += f' rot_range={self.rot_range})' - return repr_str - - -@PIPELINES.register_module() -class GlobalAlignment(object): - """Apply global alignment to 3D scene points by rotation and translation. - - Args: - rotation_axis (int): Rotation axis for points and bboxes rotation. - - Note: - We do not record the applied rotation and translation as in - GlobalRotScaleTrans. Because usually, we do not need to reverse - the alignment step. - For example, ScanNet 3D detection task uses aligned ground-truth - bounding boxes for evaluation. - """ - - def __init__(self, rotation_axis): - self.rotation_axis = rotation_axis - - def _trans_points(self, input_dict, trans_factor): - """Private function to translate points. - - Args: - input_dict (dict): Result dict from loading pipeline. - trans_factor (np.ndarray): Translation vector to be applied. - - Returns: - dict: Results after translation, 'points' is updated in the dict. - """ - input_dict['points'].translate(trans_factor) - - def _rot_points(self, input_dict, rot_mat): - """Private function to rotate bounding boxes and points. - - Args: - input_dict (dict): Result dict from loading pipeline. - rot_mat (np.ndarray): Rotation matrix to be applied. - - Returns: - dict: Results after rotation, 'points' is updated in the dict. - """ - # input should be rot_mat_T so I transpose it here - input_dict['points'].rotate(rot_mat.T) - - def _check_rot_mat(self, rot_mat): - """Check if rotation matrix is valid for self.rotation_axis. - - Args: - rot_mat (np.ndarray): Rotation matrix to be applied. - """ - is_valid = np.allclose(np.linalg.det(rot_mat), 1.0) - valid_array = np.zeros(3) - valid_array[self.rotation_axis] = 1.0 - is_valid &= (rot_mat[self.rotation_axis, :] == valid_array).all() - is_valid &= (rot_mat[:, self.rotation_axis] == valid_array).all() - assert is_valid, f'invalid rotation matrix {rot_mat}' - - def __call__(self, input_dict): - """Call function to shuffle points. - - Args: - input_dict (dict): Result dict from loading pipeline. 
- - Returns: - dict: Results after global alignment, 'points' and keys in - input_dict['bbox3d_fields'] are updated in the result dict. - """ - assert 'axis_align_matrix' in input_dict['ann_info'].keys(), \ - 'axis_align_matrix is not provided in GlobalAlignment' - - axis_align_matrix = input_dict['ann_info']['axis_align_matrix'] - assert axis_align_matrix.shape == (4, 4), \ - f'invalid shape {axis_align_matrix.shape} for axis_align_matrix' - rot_mat = axis_align_matrix[:3, :3] - trans_vec = axis_align_matrix[:3, -1] - - self._check_rot_mat(rot_mat) - self._rot_points(input_dict, rot_mat) - self._trans_points(input_dict, trans_vec) - - return input_dict - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(rotation_axis={self.rotation_axis})' - return repr_str - - -@PIPELINES.register_module() -class GlobalRotScaleTrans(object): - """Apply global rotation, scaling and translation to a 3D scene. - - Args: - rot_range (list[float], optional): Range of rotation angle. - Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). - scale_ratio_range (list[float], optional): Range of scale ratio. - Defaults to [0.95, 1.05]. - translation_std (list[float], optional): The standard deviation of - translation noise applied to a scene, which - is sampled from a gaussian distribution whose standard deviation - is set by ``translation_std``. Defaults to [0, 0, 0] - shift_height (bool, optional): Whether to shift height. - (the fourth dimension of indoor points) when scaling. - Defaults to False. - """ - - def __init__(self, - rot_range=[-0.78539816, 0.78539816], - scale_ratio_range=[0.95, 1.05], - translation_std=[0, 0, 0], - shift_height=False): - seq_types = (list, tuple, np.ndarray) - if not isinstance(rot_range, seq_types): - assert isinstance(rot_range, (int, float)), \ - f'unsupported rot_range type {type(rot_range)}' - rot_range = [-rot_range, rot_range] - self.rot_range = rot_range - - assert isinstance(scale_ratio_range, seq_types), \ - f'unsupported scale_ratio_range type {type(scale_ratio_range)}' - self.scale_ratio_range = scale_ratio_range - - if not isinstance(translation_std, seq_types): - assert isinstance(translation_std, (int, float)), \ - f'unsupported translation_std type {type(translation_std)}' - translation_std = [ - translation_std, translation_std, translation_std - ] - assert all([std >= 0 for std in translation_std]), \ - 'translation_std should be positive' - self.translation_std = translation_std - self.shift_height = shift_height - - def _trans_bbox_points(self, input_dict): - """Private function to translate bounding boxes and points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after translation, 'points', 'pcd_trans' - and keys in input_dict['bbox3d_fields'] are updated - in the result dict. - """ - translation_std = np.array(self.translation_std, dtype=np.float32) - trans_factor = np.random.normal(scale=translation_std, size=3).T - - input_dict['points'].translate(trans_factor) - input_dict['pcd_trans'] = trans_factor - for key in input_dict['bbox3d_fields']: - input_dict[key].translate(trans_factor) - - def _rot_bbox_points(self, input_dict): - """Private function to rotate bounding boxes and points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after rotation, 'points', 'pcd_rotation' - and keys in input_dict['bbox3d_fields'] are updated - in the result dict. 
- """ - rotation = self.rot_range - noise_rotation = np.random.uniform(rotation[0], rotation[1]) - - # if no bbox in input_dict, only rotate points - if len(input_dict['bbox3d_fields']) == 0: - rot_mat_T = input_dict['points'].rotate(noise_rotation) - input_dict['pcd_rotation'] = rot_mat_T - input_dict['pcd_rotation_angle'] = noise_rotation - return - - # rotate points with bboxes - for key in input_dict['bbox3d_fields']: - if len(input_dict[key].tensor) != 0: - points, rot_mat_T = input_dict[key].rotate( - noise_rotation, input_dict['points']) - input_dict['points'] = points - input_dict['pcd_rotation'] = rot_mat_T - input_dict['pcd_rotation_angle'] = noise_rotation - - def _scale_bbox_points(self, input_dict): - """Private function to scale bounding boxes and points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after scaling, 'points'and keys in - input_dict['bbox3d_fields'] are updated in the result dict. - """ - scale = input_dict['pcd_scale_factor'] - points = input_dict['points'] - points.scale(scale) - if self.shift_height: - assert 'height' in points.attribute_dims.keys(), \ - 'setting shift_height=True but points have no height attribute' - points.tensor[:, points.attribute_dims['height']] *= scale - input_dict['points'] = points - - for key in input_dict['bbox3d_fields']: - input_dict[key].scale(scale) - - def _random_scale(self, input_dict): - """Private function to randomly set the scale factor. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after scaling, 'pcd_scale_factor' are updated - in the result dict. - """ - scale_factor = np.random.uniform(self.scale_ratio_range[0], - self.scale_ratio_range[1]) - input_dict['pcd_scale_factor'] = scale_factor - - def __call__(self, input_dict): - """Private function to rotate, scale and translate bounding boxes and - points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after scaling, 'points', 'pcd_rotation', - 'pcd_scale_factor', 'pcd_trans' and keys in - input_dict['bbox3d_fields'] are updated in the result dict. - """ - if 'transformation_3d_flow' not in input_dict: - input_dict['transformation_3d_flow'] = [] - - self._rot_bbox_points(input_dict) - - if 'pcd_scale_factor' not in input_dict: - self._random_scale(input_dict) - self._scale_bbox_points(input_dict) - - self._trans_bbox_points(input_dict) - - input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(rot_range={self.rot_range},' - repr_str += f' scale_ratio_range={self.scale_ratio_range},' - repr_str += f' translation_std={self.translation_std},' - repr_str += f' shift_height={self.shift_height})' - return repr_str - - -@PIPELINES.register_module() -class PointShuffle(object): - """Shuffle input points.""" - - def __call__(self, input_dict): - """Call function to shuffle points. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. 
- """ - idx = input_dict['points'].shuffle() - idx = idx.numpy() - - pts_instance_mask = input_dict.get('pts_instance_mask', None) - pts_semantic_mask = input_dict.get('pts_semantic_mask', None) - - if pts_instance_mask is not None: - input_dict['pts_instance_mask'] = pts_instance_mask[idx] - - if pts_semantic_mask is not None: - input_dict['pts_semantic_mask'] = pts_semantic_mask[idx] - - return input_dict - - def __repr__(self): - return self.__class__.__name__ - - -@PIPELINES.register_module() -class ObjectRangeFilter(object): - """Filter objects by the range. - - Args: - point_cloud_range (list[float]): Point cloud range. - """ - - def __init__(self, point_cloud_range): - self.pcd_range = np.array(point_cloud_range, dtype=np.float32) - - def __call__(self, input_dict): - """Call function to filter objects by the range. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' - keys are updated in the result dict. - """ - # Check points instance type and initialise bev_range - if isinstance(input_dict['gt_bboxes_3d'], - (LiDARInstance3DBoxes, DepthInstance3DBoxes)): - bev_range = self.pcd_range[[0, 1, 3, 4]] - elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): - bev_range = self.pcd_range[[0, 2, 3, 5]] - - gt_bboxes_3d = input_dict['gt_bboxes_3d'] - gt_labels_3d = input_dict['gt_labels_3d'] - mask = gt_bboxes_3d.in_range_bev(bev_range) - gt_bboxes_3d = gt_bboxes_3d[mask] - # mask is a torch tensor but gt_labels_3d is still numpy array - # using mask to index gt_labels_3d will cause bug when - # len(gt_labels_3d) == 1, where mask=1 will be interpreted - # as gt_labels_3d[1] and cause out of index error - gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] - - # limit rad to [-pi, pi] - gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) - input_dict['gt_bboxes_3d'] = gt_bboxes_3d - input_dict['gt_labels_3d'] = gt_labels_3d - - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' - return repr_str - - -@PIPELINES.register_module() -class PointsRangeFilter(object): - """Filter points by the range. - - Args: - point_cloud_range (list[float]): Point cloud range. - """ - - def __init__(self, point_cloud_range): - self.pcd_range = np.array(point_cloud_range, dtype=np.float32) - - def __call__(self, input_dict): - """Call function to filter points by the range. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. 
- """ - points = input_dict['points'] - points_mask = points.in_range_3d(self.pcd_range) - clean_points = points[points_mask] - input_dict['points'] = clean_points - points_mask = points_mask.numpy() - - pts_instance_mask = input_dict.get('pts_instance_mask', None) - pts_semantic_mask = input_dict.get('pts_semantic_mask', None) - - if pts_instance_mask is not None: - input_dict['pts_instance_mask'] = pts_instance_mask[points_mask] - - if pts_semantic_mask is not None: - input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask] - - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' - return repr_str - - -@PIPELINES.register_module() -class ObjectNameFilter(object): - """Filter GT objects by their names. - - Args: - classes (list[str]): List of class names to be kept for training. - """ - - def __init__(self, classes): - self.classes = classes - self.labels = list(range(len(self.classes))) - - def __call__(self, input_dict): - """Call function to filter objects by their names. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' - keys are updated in the result dict. - """ - gt_labels_3d = input_dict['gt_labels_3d'] - gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], - dtype=np.bool_) - input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] - input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] - - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(classes={self.classes})' - return repr_str - - -@PIPELINES.register_module() -class PointSample(object): - """Point sample. - - Sampling data to a certain number. - - Args: - num_points (int): Number of points to be sampled. - sample_range (float, optional): The range where to sample points. - If not None, the points with depth larger than `sample_range` are - prior to be sampled. Defaults to None. - replace (bool, optional): Whether the sampling is with or without - replacement. Defaults to False. - """ - - def __init__(self, num_points, sample_range=None, replace=False): - self.num_points = num_points - self.sample_range = sample_range - self.replace = replace - - def _points_random_sampling(self, - points, - num_samples, - sample_range=None, - replace=False, - return_choices=False): - """Points random sampling. - - Sample points to a certain number. - - Args: - points (np.ndarray | :obj:`BasePoints`): 3D Points. - num_samples (int): Number of samples to be sampled. - sample_range (float, optional): Indicating the range where the - points will be sampled. Defaults to None. - replace (bool, optional): Sampling with or without replacement. - Defaults to None. - return_choices (bool, optional): Whether return choice. - Defaults to False. - Returns: - tuple[np.ndarray] | np.ndarray: - - points (np.ndarray | :obj:`BasePoints`): 3D Points. - - choices (np.ndarray, optional): The generated random samples. 
- """ - if not replace: - replace = (points.shape[0] < num_samples) - point_range = range(len(points)) - if sample_range is not None and not replace: - # Only sampling the near points when len(points) >= num_samples - dist = np.linalg.norm(points.tensor, axis=1) - far_inds = np.where(dist >= sample_range)[0] - near_inds = np.where(dist < sample_range)[0] - # in case there are too many far points - if len(far_inds) > num_samples: - far_inds = np.random.choice( - far_inds, num_samples, replace=False) - point_range = near_inds - num_samples -= len(far_inds) - choices = np.random.choice(point_range, num_samples, replace=replace) - if sample_range is not None and not replace: - choices = np.concatenate((far_inds, choices)) - # Shuffle points after sampling - np.random.shuffle(choices) - if return_choices: - return points[choices], choices - else: - return points[choices] - - def __call__(self, results): - """Call function to sample points to in indoor scenes. - - Args: - input_dict (dict): Result dict from loading pipeline. - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. - """ - points = results['points'] - points, choices = self._points_random_sampling( - points, - self.num_points, - self.sample_range, - self.replace, - return_choices=True) - results['points'] = points - - pts_instance_mask = results.get('pts_instance_mask', None) - pts_semantic_mask = results.get('pts_semantic_mask', None) - - if pts_instance_mask is not None: - pts_instance_mask = pts_instance_mask[choices] - results['pts_instance_mask'] = pts_instance_mask - - if pts_semantic_mask is not None: - pts_semantic_mask = pts_semantic_mask[choices] - results['pts_semantic_mask'] = pts_semantic_mask - - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(num_points={self.num_points},' - repr_str += f' sample_range={self.sample_range},' - repr_str += f' replace={self.replace})' - - return repr_str - - -@PIPELINES.register_module() -class IndoorPointSample(PointSample): - """Indoor point sample. - - Sampling data to a certain number. - NOTE: IndoorPointSample is deprecated in favor of PointSample - - Args: - num_points (int): Number of points to be sampled. - """ - - def __init__(self, *args, **kwargs): - warnings.warn( - 'IndoorPointSample is deprecated in favor of PointSample') - super(IndoorPointSample, self).__init__(*args, **kwargs) - - -@PIPELINES.register_module() -class IndoorPatchPointSample(object): - r"""Indoor point sample within a patch. Modified from `PointNet++ `_. - - Sampling data to a certain number for semantic segmentation. - - Args: - num_points (int): Number of points to be sampled. - block_size (float, optional): Size of a block to sample points from. - Defaults to 1.5. - sample_rate (float, optional): Stride used in sliding patch generation. - This parameter is unused in `IndoorPatchPointSample` and thus has - been deprecated. We plan to remove it in the future. - Defaults to None. - ignore_index (int, optional): Label index that won't be used for the - segmentation task. This is set in PointSegClassMapping as neg_cls. - If not None, will be used as a patch selection criterion. - Defaults to None. - use_normalized_coord (bool, optional): Whether to use normalized xyz as - additional features. Defaults to False. - num_try (int, optional): Number of times to try if the patch selected - is invalid. Defaults to 10. 
- enlarge_size (float, optional): Enlarge the sampled patch to - [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as - an augmentation. If None, set it as 0. Defaults to 0.2. - min_unique_num (int, optional): Minimum number of unique points - the sampled patch should contain. If None, use PointNet++'s method - to judge uniqueness. Defaults to None. - eps (float, optional): A value added to patch boundary to guarantee - points coverage. Defaults to 1e-2. - - Note: - This transform should only be used in the training process of point - cloud segmentation tasks. For the sliding patch generation and - inference process in testing, please refer to the `slide_inference` - function of `EncoderDecoder3D` class. - """ - - def __init__(self, - num_points, - block_size=1.5, - sample_rate=None, - ignore_index=None, - use_normalized_coord=False, - num_try=10, - enlarge_size=0.2, - min_unique_num=None, - eps=1e-2): - self.num_points = num_points - self.block_size = block_size - self.ignore_index = ignore_index - self.use_normalized_coord = use_normalized_coord - self.num_try = num_try - self.enlarge_size = enlarge_size if enlarge_size is not None else 0.0 - self.min_unique_num = min_unique_num - self.eps = eps - - if sample_rate is not None: - warnings.warn( - "'sample_rate' has been deprecated and will be removed in " - 'the future. Please remove them from your code.') - - def _input_generation(self, coords, patch_center, coord_max, attributes, - attribute_dims, point_type): - """Generating model input. - - Generate input by subtracting patch center and adding additional - features. Currently support colors and normalized xyz as features. - - Args: - coords (np.ndarray): Sampled 3D Points. - patch_center (np.ndarray): Center coordinate of the selected patch. - coord_max (np.ndarray): Max coordinate of all 3D Points. - attributes (np.ndarray): features of input points. - attribute_dims (dict): Dictionary to indicate the meaning of extra - dimension. - point_type (type): class of input points inherited from BasePoints. - - Returns: - :obj:`BasePoints`: The generated input data. - """ - # subtract patch center, the z dimension is not centered - centered_coords = coords.copy() - centered_coords[:, 0] -= patch_center[0] - centered_coords[:, 1] -= patch_center[1] - - if self.use_normalized_coord: - normalized_coord = coords / coord_max - attributes = np.concatenate([attributes, normalized_coord], axis=1) - if attribute_dims is None: - attribute_dims = dict() - attribute_dims.update( - dict(normalized_coord=[ - attributes.shape[1], attributes.shape[1] + - 1, attributes.shape[1] + 2 - ])) - - points = np.concatenate([centered_coords, attributes], axis=1) - points = point_type( - points, points_dim=points.shape[1], attribute_dims=attribute_dims) - - return points - - def _patch_points_sampling(self, points, sem_mask): - """Patch points sampling. - - First sample a valid patch. - Then sample points within that patch to a certain number. - - Args: - points (:obj:`BasePoints`): 3D Points. - sem_mask (np.ndarray): semantic segmentation mask for input points. - - Returns: - tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`: - - - points (:obj:`BasePoints`): 3D Points. - - choices (np.ndarray): The generated random samples. 
- """ - coords = points.coord.numpy() - attributes = points.tensor[:, 3:].numpy() - attribute_dims = points.attribute_dims - point_type = type(points) - - coord_max = np.amax(coords, axis=0) - coord_min = np.amin(coords, axis=0) - - for _ in range(self.num_try): - # random sample a point as patch center - cur_center = coords[np.random.choice(coords.shape[0])] - - # boundary of a patch, which would be enlarged by - # `self.enlarge_size` as an augmentation - cur_max = cur_center + np.array( - [self.block_size / 2.0, self.block_size / 2.0, 0.0]) - cur_min = cur_center - np.array( - [self.block_size / 2.0, self.block_size / 2.0, 0.0]) - cur_max[2] = coord_max[2] - cur_min[2] = coord_min[2] - cur_choice = np.sum( - (coords >= (cur_min - self.enlarge_size)) * - (coords <= (cur_max + self.enlarge_size)), - axis=1) == 3 - - if not cur_choice.any(): # no points in this patch - continue - - cur_coords = coords[cur_choice, :] - cur_sem_mask = sem_mask[cur_choice] - point_idxs = np.where(cur_choice)[0] - mask = np.sum( - (cur_coords >= (cur_min - self.eps)) * (cur_coords <= - (cur_max + self.eps)), - axis=1) == 3 - - # two criteria for patch sampling, adopted from PointNet++ - # 1. selected patch should contain enough unique points - if self.min_unique_num is None: - # use PointNet++'s method as default - # [31, 31, 62] are just some big values used to transform - # coords from 3d array to 1d and then check their uniqueness - # this is used in all the ScanNet code following PointNet++ - vidx = np.ceil( - (cur_coords[mask, :] - cur_min) / (cur_max - cur_min) * - np.array([31.0, 31.0, 62.0])) - vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 + - vidx[:, 2]) - flag1 = len(vidx) / 31.0 / 31.0 / 62.0 >= 0.02 - else: - # if `min_unique_num` is provided, directly compare with it - flag1 = mask.sum() >= self.min_unique_num - - # 2. selected patch should contain enough annotated points - if self.ignore_index is None: - flag2 = True - else: - flag2 = np.sum(cur_sem_mask != self.ignore_index) / \ - len(cur_sem_mask) >= 0.7 - - if flag1 and flag2: - break - - # sample idx to `self.num_points` - if point_idxs.size >= self.num_points: - # no duplicate in sub-sampling - choices = np.random.choice( - point_idxs, self.num_points, replace=False) - else: - # do not use random choice here to avoid some points not counted - dup = np.random.choice(point_idxs.size, - self.num_points - point_idxs.size) - idx_dup = np.concatenate( - [np.arange(point_idxs.size), - np.array(dup)], 0) - choices = point_idxs[idx_dup] - - # construct model input - points = self._input_generation(coords[choices], cur_center, coord_max, - attributes[choices], attribute_dims, - point_type) - - return points, choices - - def __call__(self, results): - """Call function to sample points to in indoor scenes. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. 
- """ - points = results['points'] - - assert 'pts_semantic_mask' in results.keys(), \ - 'semantic mask should be provided in training and evaluation' - pts_semantic_mask = results['pts_semantic_mask'] - - points, choices = self._patch_points_sampling(points, - pts_semantic_mask) - - results['points'] = points - results['pts_semantic_mask'] = pts_semantic_mask[choices] - pts_instance_mask = results.get('pts_instance_mask', None) - if pts_instance_mask is not None: - results['pts_instance_mask'] = pts_instance_mask[choices] - - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(num_points={self.num_points},' - repr_str += f' block_size={self.block_size},' - repr_str += f' ignore_index={self.ignore_index},' - repr_str += f' use_normalized_coord={self.use_normalized_coord},' - repr_str += f' num_try={self.num_try},' - repr_str += f' enlarge_size={self.enlarge_size},' - repr_str += f' min_unique_num={self.min_unique_num},' - repr_str += f' eps={self.eps})' - return repr_str - - -@PIPELINES.register_module() -class BackgroundPointsFilter(object): - """Filter background points near the bounding box. - - Args: - bbox_enlarge_range (tuple[float], float): Bbox enlarge range. - """ - - def __init__(self, bbox_enlarge_range): - assert (is_tuple_of(bbox_enlarge_range, float) - and len(bbox_enlarge_range) == 3) \ - or isinstance(bbox_enlarge_range, float), \ - f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}' - - if isinstance(bbox_enlarge_range, float): - bbox_enlarge_range = [bbox_enlarge_range] * 3 - self.bbox_enlarge_range = np.array( - bbox_enlarge_range, dtype=np.float32)[np.newaxis, :] - - def __call__(self, input_dict): - """Call function to filter points by the range. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after filtering, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. - """ - points = input_dict['points'] - gt_bboxes_3d = input_dict['gt_bboxes_3d'] - - # avoid groundtruth being modified - gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy() - gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.clone().numpy() - - enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy() - enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range - points_numpy = points.tensor.clone().numpy() - foreground_masks = box_np_ops.points_in_rbbox( - points_numpy, gt_bboxes_3d_np, origin=(0.5, 0.5, 0.5)) - enlarge_foreground_masks = box_np_ops.points_in_rbbox( - points_numpy, enlarged_gt_bboxes_3d, origin=(0.5, 0.5, 0.5)) - foreground_masks = foreground_masks.max(1) - enlarge_foreground_masks = enlarge_foreground_masks.max(1) - valid_masks = ~np.logical_and(~foreground_masks, - enlarge_foreground_masks) - - input_dict['points'] = points[valid_masks] - pts_instance_mask = input_dict.get('pts_instance_mask', None) - if pts_instance_mask is not None: - input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks] - - pts_semantic_mask = input_dict.get('pts_semantic_mask', None) - if pts_semantic_mask is not None: - input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks] - return input_dict - - def __repr__(self): - """str: Return a string that describes the module.""" - repr_str = self.__class__.__name__ - repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})' - return repr_str - - -@PIPELINES.register_module() -class VoxelBasedPointSampler(object): - """Voxel based point sampler. 
- - Apply voxel sampling to multiple sweep points. - - Args: - cur_sweep_cfg (dict): Config for sampling current points. - prev_sweep_cfg (dict): Config for sampling previous points. - time_dim (int): Index that indicate the time dimension - for input points. - """ - - def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3): - self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg) - self.cur_voxel_num = self.cur_voxel_generator._max_voxels - self.time_dim = time_dim - if prev_sweep_cfg is not None: - assert prev_sweep_cfg['max_num_points'] == \ - cur_sweep_cfg['max_num_points'] - self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg) - self.prev_voxel_num = self.prev_voxel_generator._max_voxels - else: - self.prev_voxel_generator = None - self.prev_voxel_num = 0 - - def _sample_points(self, points, sampler, point_dim): - """Sample points for each points subset. - - Args: - points (np.ndarray): Points subset to be sampled. - sampler (VoxelGenerator): Voxel based sampler for - each points subset. - point_dim (int): The dimension of each points - - Returns: - np.ndarray: Sampled points. - """ - voxels, coors, num_points_per_voxel = sampler.generate(points) - if voxels.shape[0] < sampler._max_voxels: - padding_points = np.zeros([ - sampler._max_voxels - voxels.shape[0], sampler._max_num_points, - point_dim - ], - dtype=points.dtype) - padding_points[:] = voxels[0] - sample_points = np.concatenate([voxels, padding_points], axis=0) - else: - sample_points = voxels - - return sample_points - - def __call__(self, results): - """Call function to sample points from multiple sweeps. - - Args: - input_dict (dict): Result dict from loading pipeline. - - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. - """ - points = results['points'] - original_dim = points.shape[1] - - # TODO: process instance and semantic mask while _max_num_points - # is larger than 1 - # Extend points with seg and mask fields - map_fields2dim = [] - start_dim = original_dim - points_numpy = points.tensor.numpy() - extra_channel = [points_numpy] - for idx, key in enumerate(results['pts_mask_fields']): - map_fields2dim.append((key, idx + start_dim)) - extra_channel.append(results[key][..., None]) - - start_dim += len(results['pts_mask_fields']) - for idx, key in enumerate(results['pts_seg_fields']): - map_fields2dim.append((key, idx + start_dim)) - extra_channel.append(results[key][..., None]) - - points_numpy = np.concatenate(extra_channel, axis=-1) - - # Split points into two part, current sweep points and - # previous sweeps points. - # TODO: support different sampling methods for next sweeps points - # and previous sweeps points. 
- cur_points_flag = (points_numpy[:, self.time_dim] == 0) - cur_sweep_points = points_numpy[cur_points_flag] - prev_sweeps_points = points_numpy[~cur_points_flag] - if prev_sweeps_points.shape[0] == 0: - prev_sweeps_points = cur_sweep_points - - # Shuffle points before sampling - np.random.shuffle(cur_sweep_points) - np.random.shuffle(prev_sweeps_points) - - cur_sweep_points = self._sample_points(cur_sweep_points, - self.cur_voxel_generator, - points_numpy.shape[1]) - if self.prev_voxel_generator is not None: - prev_sweeps_points = self._sample_points(prev_sweeps_points, - self.prev_voxel_generator, - points_numpy.shape[1]) - - points_numpy = np.concatenate( - [cur_sweep_points, prev_sweeps_points], 0) - else: - points_numpy = cur_sweep_points - - if self.cur_voxel_generator._max_num_points == 1: - points_numpy = points_numpy.squeeze(1) - results['points'] = points.new_point(points_numpy[..., :original_dim]) - - # Restore the corresponding seg and mask fields - for key, dim_index in map_fields2dim: - results[key] = points_numpy[..., dim_index] - - return results - - def __repr__(self): - """str: Return a string that describes the module.""" - - def _auto_indent(repr_str, indent): - repr_str = repr_str.split('\n') - repr_str = [' ' * indent + t + '\n' for t in repr_str] - repr_str = ''.join(repr_str)[:-1] - return repr_str - - repr_str = self.__class__.__name__ - indent = 4 - repr_str += '(\n' - repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n' - repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n' - repr_str += ' ' * indent + f'time_dim={self.time_dim},\n' - repr_str += ' ' * indent + 'cur_voxel_generator=\n' - repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n' - repr_str += ' ' * indent + 'prev_voxel_generator=\n' - repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})' - return repr_str - - -@PIPELINES.register_module() -class AffineResize(object): - """Get the affine transform matrices to the target size. - - Different from :class:`RandomAffine` in MMDetection, this class can - calculate the affine transform matrices while resizing the input image - to a fixed size. The affine transform matrices include: 1) matrix - transforming original image to the network input image size. 2) matrix - transforming original image to the network output feature map size. - - Args: - img_scale (tuple): Images scales for resizing. - down_ratio (int): The down ratio of feature map. - Actually the arg should be >= 1. - bbox_clip_border (bool, optional): Whether clip the objects - outside the border of the image. Defaults to True. - """ - - def __init__(self, img_scale, down_ratio, bbox_clip_border=True): - - self.img_scale = img_scale - self.down_ratio = down_ratio - self.bbox_clip_border = bbox_clip_border - - def __call__(self, results): - """Call function to do affine transform to input image and labels. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Results after affine resize, 'affine_aug', 'trans_mat' - keys are added in the result dict. 
- """ - # The results have gone through RandomShiftScale before AffineResize - if 'center' not in results: - img = results['img'] - height, width = img.shape[:2] - center = np.array([width / 2, height / 2], dtype=np.float32) - size = np.array([width, height], dtype=np.float32) - results['affine_aug'] = False - else: - # The results did not go through RandomShiftScale before - # AffineResize - img = results['img'] - center = results['center'] - size = results['size'] - - trans_affine = self._get_transform_matrix(center, size, self.img_scale) - - img = cv2.warpAffine(img, trans_affine[:2, :], self.img_scale) - - if isinstance(self.down_ratio, tuple): - trans_mat = [ - self._get_transform_matrix( - center, size, - (self.img_scale[0] // ratio, self.img_scale[1] // ratio)) - for ratio in self.down_ratio - ] # (3, 3) - else: - trans_mat = self._get_transform_matrix( - center, size, (self.img_scale[0] // self.down_ratio, - self.img_scale[1] // self.down_ratio)) - - results['img'] = img - results['img_shape'] = img.shape - results['pad_shape'] = img.shape - results['trans_mat'] = trans_mat - - self._affine_bboxes(results, trans_affine) - - if 'centers2d' in results: - centers2d = self._affine_transform(results['centers2d'], - trans_affine) - valid_index = (centers2d[:, 0] > - 0) & (centers2d[:, 0] < - self.img_scale[0]) & (centers2d[:, 1] > 0) & ( - centers2d[:, 1] < self.img_scale[1]) - results['centers2d'] = centers2d[valid_index] - - for key in results.get('bbox_fields', []): - if key in ['gt_bboxes']: - results[key] = results[key][valid_index] - if 'gt_labels' in results: - results['gt_labels'] = results['gt_labels'][ - valid_index] - if 'gt_masks' in results: - raise NotImplementedError( - 'AffineResize only supports bbox.') - - for key in results.get('bbox3d_fields', []): - if key in ['gt_bboxes_3d']: - results[key].tensor = results[key].tensor[valid_index] - if 'gt_labels_3d' in results: - results['gt_labels_3d'] = results['gt_labels_3d'][ - valid_index] - - results['depths'] = results['depths'][valid_index] - - return results - - def _affine_bboxes(self, results, matrix): - """Affine transform bboxes to input image. - - Args: - results (dict): Result dict from loading pipeline. - matrix (np.ndarray): Matrix transforming original - image to the network input image size. - shape: (3, 3) - """ - - for key in results.get('bbox_fields', []): - bboxes = results[key] - bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix) - bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix) - if self.bbox_clip_border: - bboxes[:, - [0, 2]] = bboxes[:, - [0, 2]].clip(0, self.img_scale[0] - 1) - bboxes[:, - [1, 3]] = bboxes[:, - [1, 3]].clip(0, self.img_scale[1] - 1) - results[key] = bboxes - - def _affine_transform(self, points, matrix): - """Affine transform bbox points to input image. - - Args: - points (np.ndarray): Points to be transformed. - shape: (N, 2) - matrix (np.ndarray): Affine transform matrix. - shape: (3, 3) - - Returns: - np.ndarray: Transformed points. - """ - num_points = points.shape[0] - hom_points_2d = np.concatenate((points, np.ones((num_points, 1))), - axis=1) - hom_points_2d = hom_points_2d.T - affined_points = np.matmul(matrix, hom_points_2d).T - return affined_points[:, :2] - - def _get_transform_matrix(self, center, scale, output_scale): - """Get affine transform matrix. - - Args: - center (tuple): Center of current image. - scale (tuple): Scale of current image. - output_scale (tuple[float]): The transform target image scales. 
- - Returns: - np.ndarray: Affine transform matrix. - """ - # TODO: further add rot and shift here. - src_w = scale[0] - dst_w = output_scale[0] - dst_h = output_scale[1] - - src_dir = np.array([0, src_w * -0.5]) - dst_dir = np.array([0, dst_w * -0.5]) - - src = np.zeros((3, 2), dtype=np.float32) - dst = np.zeros((3, 2), dtype=np.float32) - src[0, :] = center - src[1, :] = center + src_dir - dst[0, :] = np.array([dst_w * 0.5, dst_h * 0.5]) - dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir - - src[2, :] = self._get_ref_point(src[0, :], src[1, :]) - dst[2, :] = self._get_ref_point(dst[0, :], dst[1, :]) - - get_matrix = cv2.getAffineTransform(src, dst) - - matrix = np.concatenate((get_matrix, [[0., 0., 1.]])) - - return matrix.astype(np.float32) - - def _get_ref_point(self, ref_point1, ref_point2): - """Get reference point to calculate affine transform matrix. - - While using opencv to calculate the affine matrix, we need at least - three corresponding points separately on original image and target - image. Here we use two points to get the the third reference point. - """ - d = ref_point1 - ref_point2 - ref_point3 = ref_point2 + np.array([-d[1], d[0]]) - return ref_point3 - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(img_scale={self.img_scale}, ' - repr_str += f'down_ratio={self.down_ratio}) ' - return repr_str - - -@PIPELINES.register_module() -class RandomShiftScale(object): - """Random shift scale. - - Different from the normal shift and scale function, it doesn't - directly shift or scale image. It can record the shift and scale - infos into loading pipelines. It's designed to be used with - AffineResize together. - - Args: - shift_scale (tuple[float]): Shift and scale range. - aug_prob (float): The shifting and scaling probability. - """ - - def __init__(self, shift_scale, aug_prob): - - self.shift_scale = shift_scale - self.aug_prob = aug_prob - - def __call__(self, results): - """Call function to record random shift and scale infos. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Results after random shift and scale, 'center', 'size' - and 'affine_aug' keys are added in the result dict. - """ - img = results['img'] - - height, width = img.shape[:2] - - center = np.array([width / 2, height / 2], dtype=np.float32) - size = np.array([width, height], dtype=np.float32) - - if random.random() < self.aug_prob: - shift, scale = self.shift_scale[0], self.shift_scale[1] - shift_ranges = np.arange(-shift, shift + 0.1, 0.1) - center[0] += size[0] * random.choice(shift_ranges) - center[1] += size[1] * random.choice(shift_ranges) - scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1) - size *= random.choice(scale_ranges) - results['affine_aug'] = True - else: - results['affine_aug'] = False - - results['center'] = center - results['size'] = size - - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(shift_scale={self.shift_scale}, ' - repr_str += f'aug_prob={self.aug_prob}) ' - return repr_str - -@PIPELINES.register_module() -class DataFilter(object): - """Point sample. - - Sampling data to a certain number. - - Args: - num_points (int): Number of points to be sampled. - sample_range (float, optional): The range where to sample points. - If not None, the points with depth larger than `sample_range` are - prior to be sampled. Defaults to None. - replace (bool, optional): Whether the sampling is with or without - replacement. Defaults to False. 
- """ - - def __init__(self, method='remove_statistical_outlier', params={"nb_neighbors" : 20, "std_ratio" : 0.05}): - self.method = method - self.params = params - - def remove_statistical_outlier(self, lidar_data, nb_neighbors, std_ratio): - - points = lidar_data['points'].tensor.numpy() - - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points[:, :3]) - filtered_pcd, ind = pcd.remove_statistical_outlier(nb_neighbors, std_ratio) - - filterer_points = np.asarray(filtered_pcd.points) - if (points.shape[1] > 3): - filterer_points = np.concatenate((filterer_points, points[ind, 3:]), axis=-1) - - filtered_lidar_data = lidar_data - filtered_lidar_data["points"] = LiDARPoints(filterer_points.astype(np.float32), points_dim=filterer_points.shape[-1], attribute_dims=None) - - return filtered_lidar_data - - def remove_radius_outlier(self, lidar_data, nb_points, radius): - - points = lidar_data['points'].tensor.numpy() - - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points[:, :3]) - filtered_pcd, ind = pcd.remove_radius_outlier(nb_points, radius) - - filterer_points = np.asarray(filtered_pcd.points) - if (points.shape[1] > 3): - filterer_points = np.concatenate((filterer_points, points[ind, 3:]), axis=-1) - - filtered_lidar_data = lidar_data - filtered_lidar_data["points"] = LiDARPoints(filterer_points.astype(np.float32), points_dim=filterer_points.shape[-1], attribute_dims=None) - - return filtered_lidar_data - - def voxel_down_sample(self, lidar_data, voxel_size): - - points = lidar_data['points'].tensor.numpy() - - pcd = o3d.geometry.PointCloud() - pcd.points = o3d.utility.Vector3dVector(points[:, :3]) - # filtered_pcd, ind = pcd.voxel_down_sample(voxel_size) - filtered_pcd = pcd.voxel_down_sample(voxel_size) - - filterer_points = np.asarray(filtered_pcd.points) - - filtered_lidar_data = lidar_data - filtered_lidar_data["points"] = LiDARPoints(filterer_points.astype(np.float32), points_dim=filterer_points.shape[-1], attribute_dims=None) - return filtered_lidar_data - - - def __call__(self, results): - """Call function to sample points to in indoor scenes. - - Args: - input_dict (dict): Result dict from loading pipeline. - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. - """ - if self.method == 'remove_statistical_outlier': - self.remove_statistical_outlier(results, **self.params) - elif self.method == 'remove_radius_outlier': - self.remove_radius_outlier(results, **self.params) - elif self.method == 'voxel_down_sample': - self.voxel_down_sample(results, **self.params) - - return results - -@PIPELINES.register_module() -class DataAugmentor(object): - """Point sample. - - Sampling data to a certain number. - - Args: - num_points (int): Number of points to be sampled. - sample_range (float, optional): The range where to sample points. - If not None, the points with depth larger than `sample_range` are - prior to be sampled. Defaults to None. - replace (bool, optional): Whether the sampling is with or without - replacement. Defaults to False. 
- """ - - def __init__(self, method, params=None): - self.method = method - self.params = params - - def random_flip_along_x(self, results): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - enable = np.random.choice([False, True], replace=False, p=[0.5, 0.5]) - - if enable: - - gt_boxes[:, 1] = -gt_boxes[:, 1] - gt_boxes[:, 6] = -gt_boxes[:, 6] - points[:, 1] = -points[:, 1] - - if gt_boxes.shape[1] > 7: - gt_boxes[:, 8] = -gt_boxes[:, 8] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def random_flip_along_y(self, results): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - enable = np.random.choice([False, True], replace=False, p=[0.5, 0.5]) - - if enable: - gt_boxes[:, 0] = -gt_boxes[:, 0] - gt_boxes[:, 6] = -(gt_boxes[:, 6] + np.pi) - # points[:, 0] = -points[:, 0] - - if gt_boxes.shape[1] > 7: - gt_boxes[:, 7] = -gt_boxes[:, 7] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def global_scaling(self, results, scale_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - if scale_range[1] - scale_range[0] < 1e-3: - return results - noise_scale = np.random.uniform(scale_range[0], scale_range[1]) - points[:, :3] *= noise_scale - gt_boxes[:, :6] *= noise_scale - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def random_translation_along_x(self, results, offset_std): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - offset = np.random.normal(0, offset_std, 1) - - points[:, 0] += offset - gt_boxes[:, 0] += offset - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def random_translation_along_y(self, results, offset_std): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - offset = np.random.normal(0, offset_std, 1) - - points[:, 1] += offset - gt_boxes[:, 1] += offset - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def random_translation_along_z(self, results, offset_std): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = 
results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - offset = np.random.normal(0, offset_std, 1) - - points[:, 2] += offset - gt_boxes[:, 2] += offset - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def global_frustum_dropout_top(self, results, intensity_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - intensity = np.random.uniform(intensity_range[0], intensity_range[1]) - # threshold = max - length * uniform(0 ~ 0.2) - threshold = np.max(points[:, 2]) - intensity * (np.max(points[:, 2]) - np.min(points[:, 2])) - - points = points[points[:, 2] < threshold] - gt_boxes = gt_boxes[gt_boxes[:, 2] < threshold] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def global_frustum_dropout_bottom(self, results, intensity_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - intensity = np.random.uniform(intensity_range[0], intensity_range[1]) - - threshold = np.min(points[:, 2]) + intensity * (np.max(points[:, 2]) - np.min(points[:, 2])) - points = points[points[:, 2] > threshold] - gt_boxes = gt_boxes[gt_boxes[:, 2] > threshold] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def global_frustum_dropout_left(self, results, intensity_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - intensity = np.random.uniform(intensity_range[0], intensity_range[1]) - - threshold = np.max(points[:, 1]) - intensity * (np.max(points[:, 1]) - np.min(points[:, 1])) - points = points[points[:, 1] < threshold] - gt_boxes = gt_boxes[gt_boxes[:, 1] < threshold] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def global_frustum_dropout_right(self, results, intensity_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - intensity = np.random.uniform(intensity_range[0], intensity_range[1]) - - threshold = np.min(points[:, 1]) + intensity * (np.max(points[:, 1]) - np.min(points[:, 1])) - points = points[points[:, 1] > threshold] - gt_boxes = gt_boxes[gt_boxes[:, 1] > threshold] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - 
- def local_scaling(self, results, scale_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - if scale_range[1] - scale_range[0] < 1e-3: - return results - - # augs = {} - for idx, box in enumerate(gt_boxes): - noise_scale = np.random.uniform(scale_range[0], scale_range[1]) - # augs[f'object_{idx}'] = noise_scale - points_in_box, mask = get_points_in_box(points, box) - - # tranlation to axis center - points[mask, 0] -= box[0] - points[mask, 1] -= box[1] - points[mask, 2] -= box[2] - - # apply scaling - points[mask, :3] *= noise_scale - - # tranlation back to original position - points[mask, 0] += box[0] - points[mask, 1] += box[1] - points[mask, 2] += box[2] - - gt_boxes[idx, 3:6] *= noise_scale - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def random_local_translation_along_x(self, results, offset_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - for idx, box in enumerate(gt_boxes): - offset = np.random.uniform(offset_range[0], offset_range[1]) - # augs[f'object_{idx}'] = offset - points_in_box, mask = get_points_in_box(points, box) - points[mask, 0] += offset - - gt_boxes[idx, 0] += offset - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def random_local_translation_along_y(self, results, offset_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - for idx, box in enumerate(gt_boxes): - offset = np.random.uniform(offset_range[0], offset_range[1]) - # augs[f'object_{idx}'] = offset - points_in_box, mask = get_points_in_box(points, box) - points[mask, 1] += offset - - gt_boxes[idx, 1] += offset - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def random_local_translation_along_z(self, results, offset_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - for idx, box in enumerate(gt_boxes): - offset = np.random.uniform(offset_range[0], offset_range[1]) - # augs[f'object_{idx}'] = offset - points_in_box, mask = get_points_in_box(points, box) - points[mask, 2] += offset - - gt_boxes[idx, 2] += offset - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def local_frustum_dropout_top(self, results, intensity_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = 
results['ann_info']['Trv2c'] - - for idx, box in enumerate(gt_boxes): - x, y, z, dx, dy, dz = box[0], box[1], box[2], box[3], box[4], box[5] - - intensity = np.random.uniform(intensity_range[0], intensity_range[1]) - points_in_box, mask = get_points_in_box(points, box) - threshold = (z + dz / 2) - intensity * dz - - points = points[np.logical_not(np.logical_and(mask, points[:, 2] >= threshold))] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def local_frustum_dropout_bottle(self, results, intensity_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - for idx, box in enumerate(gt_boxes): - x, y, z, dx, dy, dz = box[0], box[1], box[2], box[3], box[4], box[5] - - intensity = np.random.uniform(intensity_range[0], intensity_range[1]) - points_in_box, mask = get_points_in_box(points, box) - threshold = (z - dz / 2) + intensity * dz - - points = points[np.logical_not(np.logical_and(mask, points[:, 2] <= threshold))] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def local_frustum_dropout_left(self, results, intensity_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - for idx, box in enumerate(gt_boxes): - x, y, z, dx, dy, dz = box[0], box[1], box[2], box[3], box[4], box[5] - - intensity = np.random.uniform(intensity_range[0], intensity_range[1]) - points_in_box, mask = get_points_in_box(points, box) - threshold = (y + dy / 2) - intensity * dy - - points = points[np.logical_not(np.logical_and(mask, points[:, 1] >= threshold))] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def local_frustum_dropout_right(self, results, intensity_range): - - gt_boxes = results['gt_bboxes_3d'].tensor.numpy() - points = results['points'].tensor.numpy() - rect = results['ann_info']['rect'] - Trv2c = results['ann_info']['Trv2c'] - - for idx, box in enumerate(gt_boxes): - x, y, z, dx, dy, dz = box[0], box[1], box[2], box[3], box[4], box[5] - - intensity = np.random.uniform(intensity_range[0], intensity_range[1]) - points_in_box, mask = get_points_in_box(points, box) - threshold = (y - dy / 2) + intensity * dy - - points = points[np.logical_not(np.logical_and(mask, points[:, 1] <= threshold))] - - results["points"] = LiDARPoints(points.astype(np.float32), - points_dim=points.shape[-1], attribute_dims=None) - results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( - Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) - return results - - def __call__(self, results): - """Call function to sample points to in indoor scenes. - - Args: - input_dict (dict): Result dict from loading pipeline. - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. 
- """ - - if self.method == 'random_flip_along_x': - results = self.random_flip_along_x(results) - elif self.method == 'random_flip_along_y': - results = self.random_flip_along_y(results) - - # elif self.method == 'global_rotation': - # gt_boxes, points = self.global_rotation(results, **self.params) - elif self.method == 'global_scaling': - results = self.global_scaling(results, self.params) - - elif self.method == 'random_translation_along_x': - results = self.random_translation_along_x(results, self.params) - elif self.method == 'random_translation_along_y': - results = self.random_translation_along_y(results, self.params) - elif self.method == 'random_translation_along_z': - results = self.random_translation_along_z(results, self.params) - - elif self.method == 'global_frustum_dropout_top': - results = self.global_frustum_dropout_top(results, self.params) - elif self.method == 'global_frustum_dropout_bottom': - results = self.global_frustum_dropout_bottom(results, self.params) - elif self.method == 'global_frustum_dropout_left': - results = self.global_frustum_dropout_left(results, self.params) - elif self.method == 'global_frustum_dropout_right': - results = self.global_frustum_dropout_right(results, self.params) - - elif self.method == 'local_scaling': - results = self.local_scaling(results, self.params) - # elif self.method == 'local_rotation': - # results = self.local_rotation(results, self.params) - - elif self.method == 'random_local_translation_along_x': - results = self.random_local_translation_along_x(results, self.params) - elif self.method == 'random_local_translation_along_y': - results = self.random_local_translation_along_y(results, self.params) - elif self.method == 'random_local_translation_along_z': - results = self.random_local_translation_along_z(results, self.params) - - elif self.method == 'local_frustum_dropout_top': - results = self.local_frustum_dropout_top(results, self.params) - elif self.method == 'local_frustum_dropout_bottom': - results = self.local_frustum_dropout_bottom(results, self.params) - elif self.method == 'local_frustum_dropout_left': - results = self.local_frustum_dropout_left(results, self.params) - elif self.method == 'local_frustum_dropout_right': - results = self.local_frustum_dropout_right(results, self.params) - - return results - -@PIPELINES.register_module() -class DataDenoisor(object): - """Point sample. - - Sampling data to a certain number. - - Args: - num_points (int): Number of points to be sampled. - sample_range (float, optional): The range where to sample points. - If not None, the points with depth larger than `sample_range` are - prior to be sampled. Defaults to None. - replace (bool, optional): Whether the sampling is with or without - replacement. Defaults to False. - """ - - def __init__(self, method): - self.method = method - - def __call__(self, results): - """Call function to sample points to in indoor scenes. - - Args: - input_dict (dict): Result dict from loading pipeline. - Returns: - dict: Results after sampling, 'points', 'pts_instance_mask' - and 'pts_semantic_mask' keys are updated in the result dict. 
- """ - points = results['points'].tensor.numpy() - points = torch.tensor(points[:, :3]) - - if self.method == 'pcp': - param_filename = 'deephub/denoisy_model/pcp/pretrained/denoisingModel/PointCleanNet_params.pth' - model_filename = 'deephub/denoisy_model/pcp/pretrained/denoisingModel/PointCleanNet_model.pth' - trainopt = torch.load(param_filename) - pred_dim = 0 - output_pred_ind = [] - for o in trainopt.outputs: - if o in ['clean_points']: - output_pred_ind.append(pred_dim) - pred_dim += 3 - else: - raise ValueError('Unknown output: %s' % (o)) - - regressor = ResPCPNet( - num_points=trainopt.points_per_patch, - output_dim=pred_dim, - use_point_stn=trainopt.use_point_stn, - use_feat_stn=trainopt.use_feat_stn, - sym_op=trainopt.sym_op, - point_tuple=trainopt.point_tuple) - state_dict = torch.load(model_filename,map_location='cpu') - regressor.load_state_dict(state_dict) - - pred, trans, _, _ = regressor(points) - patch_radiuses = torch.FloatTensor([0.05]) - - denoised = pred - elif self.method == 'dmr': - num_points = points.shape[0] - if num_points >= 120000: - print('[INFO] Denoising large point cloud.') - denoised, downsampled = run_denoise_large_pointcloud( - pc=points, - cluster_size=30000, - patch_size=1000, - ckpt='deephub/denoisy_model/dmr/pretrained/supervised/epoch=153.ckpt', - device='cuda:0', - random_state=0, - expand_knn=16 - ) - elif num_points >= 60000: - print('[INFO] Denoising middle-sized point cloud.') - denoised, downsampled = run_denoise_middle_pointcloud( - pc=points, - num_splits=2, - patch_size=1000, - ckpt='deephub/denoisy_model/dmr/pretrained/supervised/epoch=153.ckpt', - device='cuda:0', - random_state=0, - expand_knn=16 - ) - elif num_points >= 10000: - print('[INFO] Denoising regular-sized point cloud.') - denoised, downsampled = run_denoise( - pc=points, - patch_size=1000, - ckpt='deephub/denoisy_model/dmr/pretrained/supervised/epoch=153.ckpt', - device='cuda:0', - random_state=0, - expand_knn=16 - ) - else: - assert False, "Our pretrained model does not support point clouds with less than 10K points." - results["points"] = LiDARPoints(denoised.astype(np.float32), - points_dim=denoised.shape[-1], attribute_dims=None) +# Copyright (c) OpenMMLab. All rights reserved. +import random +import warnings + +import cv2 +import numpy as np +import torch +from mmcv import is_tuple_of +from mmcv.utils import build_from_cfg + +from mmdet3d.core import VoxelGenerator +from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, + LiDARInstance3DBoxes, box_np_ops) +from mmdet3d.core.bbox.structures import Box3DMode +from mmdet3d.core.points.lidar_points import LiDARPoints +from mmdet3d.datasets.pipelines.compose import Compose +from mmdet.datasets.pipelines import RandomCrop, RandomFlip, Rotate +from ..builder import OBJECTSAMPLERS, PIPELINES +from .data_augment_utils import noise_per_object_v3_ +import open3d as o3d +from data.data_augmentor.augmentation_utils import get_points_in_box +from data.data_denoisor.denoisor_pcp_utils import ResPCPNet +from data.data_denoisor.denoisor_dmr_utils import run_denoise_large_pointcloud, run_denoise_middle_pointcloud, run_denoise + +@PIPELINES.register_module() +class RandomDropPointsColor(object): + r"""Randomly set the color of points to all zeros. + + Once this transform is executed, all the points' color will be dropped. + Refer to `PAConv `_ for more details. + + Args: + drop_ratio (float, optional): The probability of dropping point colors. + Defaults to 0.2. 
+ """ + + def __init__(self, drop_ratio=0.2): + assert isinstance(drop_ratio, (int, float)) and 0 <= drop_ratio <= 1, \ + f'invalid drop_ratio value {drop_ratio}' + self.drop_ratio = drop_ratio + + def __call__(self, input_dict): + """Call function to drop point colors. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after color dropping, + 'points' key is updated in the result dict. + """ + points = input_dict['points'] + assert points.attribute_dims is not None and \ + 'color' in points.attribute_dims, \ + 'Expect points have color attribute' + + # this if-expression is a bit strange + # `RandomDropPointsColor` is used in training 3D segmentor PAConv + # we discovered in our experiments that, using + # `if np.random.rand() > 1.0 - self.drop_ratio` consistently leads to + # better results than using `if np.random.rand() < self.drop_ratio` + # so we keep this hack in our codebase + if np.random.rand() > 1.0 - self.drop_ratio: + points.color = points.color * 0.0 + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(drop_ratio={self.drop_ratio})' + return repr_str + + +@PIPELINES.register_module() +class RandomFlip3D(RandomFlip): + """Flip the points & bbox. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + Args: + sync_2d (bool, optional): Whether to apply flip according to the 2D + images. If True, it will apply the same flip as that to 2D images. + If False, it will decide whether to flip randomly and independently + to that of 2D images. Defaults to True. + flip_ratio_bev_horizontal (float, optional): The flipping probability + in horizontal direction. Defaults to 0.0. + flip_ratio_bev_vertical (float, optional): The flipping probability + in vertical direction. Defaults to 0.0. + """ + + def __init__(self, + sync_2d=True, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0, + **kwargs): + super(RandomFlip3D, self).__init__( + flip_ratio=flip_ratio_bev_horizontal, **kwargs) + self.sync_2d = sync_2d + self.flip_ratio_bev_vertical = flip_ratio_bev_vertical + if flip_ratio_bev_horizontal is not None: + assert isinstance( + flip_ratio_bev_horizontal, + (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1 + if flip_ratio_bev_vertical is not None: + assert isinstance( + flip_ratio_bev_vertical, + (int, float)) and 0 <= flip_ratio_bev_vertical <= 1 + + def random_flip_data_3d(self, input_dict, direction='horizontal'): + """Flip 3D data randomly. + + Args: + input_dict (dict): Result dict from loading pipeline. + direction (str, optional): Flip direction. + Default: 'horizontal'. + + Returns: + dict: Flipped results, 'points', 'bbox3d_fields' keys are + updated in the result dict. + """ + assert direction in ['horizontal', 'vertical'] + # for semantic segmentation task, only points will be flipped. 
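+ # (the early return below covers exactly that case; for detection data
+ # in test mode, where 'bbox3d_fields' exists but is empty, a placeholder
+ # 'empty_box3d' is registered further down so the flip can still be
+ # applied to the points through the box class API)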
+ if 'bbox3d_fields' not in input_dict: + input_dict['points'].flip(direction) + return + if len(input_dict['bbox3d_fields']) == 0: # test mode + input_dict['bbox3d_fields'].append('empty_box3d') + input_dict['empty_box3d'] = input_dict['box_type_3d']( + np.array([], dtype=np.float32)) + assert len(input_dict['bbox3d_fields']) == 1 + for key in input_dict['bbox3d_fields']: + if 'points' in input_dict: + input_dict['points'] = input_dict[key].flip( + direction, points=input_dict['points']) + else: + input_dict[key].flip(direction) + if 'centers2d' in input_dict: + assert self.sync_2d is True and direction == 'horizontal', \ + 'Only support sync_2d=True and horizontal flip with images' + w = input_dict['ori_shape'][1] + input_dict['centers2d'][..., 0] = \ + w - input_dict['centers2d'][..., 0] + # need to modify the horizontal position of camera center + # along u-axis in the image (flip like centers2d) + # ['cam2img'][0][2] = c_u + # see more details and examples at + # https://github.com/open-mmlab/mmdetection3d/pull/744 + input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2] + + def __call__(self, input_dict): + """Call function to flip points, values in the ``bbox3d_fields`` and + also flip 2D image and its annotations. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction', + 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added + into result dict. + """ + # flip 2D image and its annotations + super(RandomFlip3D, self).__call__(input_dict) + + if self.sync_2d: + input_dict['pcd_horizontal_flip'] = input_dict['flip'] + input_dict['pcd_vertical_flip'] = False + else: + if 'pcd_horizontal_flip' not in input_dict: + flip_horizontal = True if np.random.rand( + ) < self.flip_ratio else False + input_dict['pcd_horizontal_flip'] = flip_horizontal + if 'pcd_vertical_flip' not in input_dict: + flip_vertical = True if np.random.rand( + ) < self.flip_ratio_bev_vertical else False + input_dict['pcd_vertical_flip'] = flip_vertical + + if 'transformation_3d_flow' not in input_dict: + input_dict['transformation_3d_flow'] = [] + + if input_dict['pcd_horizontal_flip']: + self.random_flip_data_3d(input_dict, 'horizontal') + input_dict['transformation_3d_flow'].extend(['HF']) + if input_dict['pcd_vertical_flip']: + self.random_flip_data_3d(input_dict, 'vertical') + input_dict['transformation_3d_flow'].extend(['VF']) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(sync_2d={self.sync_2d},' + repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})' + return repr_str + + +@PIPELINES.register_module() +class MultiViewWrapper(object): + """Wrap transformation from single-view into multi-view. + + The wrapper processes the images from multi-view one by one. For each + image, it constructs a pseudo dict according to the keys specified by the + 'process_fields' parameter. After the transformation is finished, desired + information can be collected by specifying the keys in the 'collected_keys' + parameter. Multi-view images share the same transformation parameters + but do not share the same magnitude when a random transformation is + conducted. + + Args: + transforms (list[dict]): A list of dict specifying the transformations + for the monocular situation. + process_fields (dict): Desired keys that the transformations should + be conducted on. Default to dict(img_fields=['img']). 
+ collected_keys (list[str]): Collect information in transformation + like rotate angles, crop roi, and flip state. + """ + + def __init__(self, + transforms, + process_fields=dict(img_fields=['img']), + collected_keys=[]): + self.transform = Compose(transforms) + self.collected_keys = collected_keys + self.process_fields = process_fields + + def __call__(self, input_dict): + for key in self.collected_keys: + input_dict[key] = [] + for img_id in range(len(input_dict['img'])): + process_dict = self.process_fields.copy() + for field in self.process_fields: + for key in self.process_fields[field]: + process_dict[key] = input_dict[key][img_id] + process_dict = self.transform(process_dict) + for field in self.process_fields: + for key in self.process_fields[field]: + input_dict[key][img_id] = process_dict[key] + for key in self.collected_keys: + input_dict[key].append(process_dict[key]) + return input_dict + + +@PIPELINES.register_module() +class RangeLimitedRandomCrop(RandomCrop): + """Randomly crop image-view objects under a limitation of range. + + Args: + relative_x_offset_range (tuple[float]): Relative range of random crop + in x direction. (x_min, x_max) in [0, 1.0]. Default to (0.0, 1.0). + relative_y_offset_range (tuple[float]): Relative range of random crop + in y direction. (y_min, y_max) in [0, 1.0]. Default to (0.0, 1.0). + """ + + def __init__(self, + relative_x_offset_range=(0.0, 1.0), + relative_y_offset_range=(0.0, 1.0), + **kwargs): + super(RangeLimitedRandomCrop, self).__init__(**kwargs) + for range in [relative_x_offset_range, relative_y_offset_range]: + assert 0 <= range[0] <= range[1] <= 1 + self.relative_x_offset_range = relative_x_offset_range + self.relative_y_offset_range = relative_y_offset_range + + def _crop_data(self, results, crop_size, allow_negative_crop): + """Function to randomly crop images. + + Modified from RandomCrop in mmdet==2.25.0 + + Args: + results (dict): Result dict from loading pipeline. + crop_size (tuple): Expected absolute size after cropping, (h, w). + + Returns: + dict: Randomly cropped results, 'img_shape' key in result dict is + updated according to crop size. + """ + assert crop_size[0] > 0 and crop_size[1] > 0 + for key in results.get('img_fields', ['img']): + img = results[key] + margin_h = max(img.shape[0] - crop_size[0], 0) + margin_w = max(img.shape[1] - crop_size[1], 0) + offset_range_h = (margin_h * self.relative_y_offset_range[0], + margin_h * self.relative_y_offset_range[1] + 1) + offset_h = np.random.randint(*offset_range_h) + offset_range_w = (margin_w * self.relative_x_offset_range[0], + margin_w * self.relative_x_offset_range[1] + 1) + offset_w = np.random.randint(*offset_range_w) + crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] + crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] + + # crop the image + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] + img_shape = img.shape + results[key] = img + results['crop'] = (crop_x1, crop_y1, crop_x2, crop_y2) + results['img_shape'] = img_shape + + # crop bboxes accordingly and clip to the image boundary + for key in results.get('bbox_fields', []): + # e.g. 
gt_bboxes and gt_bboxes_ignore + bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], + dtype=np.float32) + bboxes = results[key] - bbox_offset + if self.bbox_clip_border: + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( + bboxes[:, 3] > bboxes[:, 1]) + # If the crop does not contain any gt-bbox area and + # allow_negative_crop is False, skip this image. + if (key == 'gt_bboxes' and not valid_inds.any() + and not allow_negative_crop): + return None + results[key] = bboxes[valid_inds, :] + # label fields. e.g. gt_labels and gt_labels_ignore + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][valid_inds] + + # mask fields, e.g. gt_masks and gt_masks_ignore + mask_key = self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + if self.recompute_bbox: + results[key] = results[mask_key].get_bboxes() + + # crop semantic seg + for key in results.get('seg_fields', []): + results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] + + return results + + +@PIPELINES.register_module() +class RandomRotate(Rotate): + """Randomly rotate images. + + The rotation angle is selected uniformly within the interval specified by + the 'range' parameter. + + Args: + range (tuple[float]): Define the range of random rotation. + (angle_min, angle_max) of the rotation angle. + """ + + def __init__(self, range, **kwargs): + super(RandomRotate, self).__init__(**kwargs) + self.range = range + + def __call__(self, results): + self.angle = np.random.uniform(self.range[0], self.range[1]) + super(RandomRotate, self).__call__(results) + results['rotate'] = self.angle + return results + + +@PIPELINES.register_module() +class RandomJitterPoints(object): + """Randomly jitter point coordinates. + + Different from the global translation in ``GlobalRotScaleTrans``, here we + apply different noises to each point in a scene. + + Args: + jitter_std (list[float]): The standard deviation of jittering noise. + This applies random noise to all points in a 3D scene, which is + sampled from a gaussian distribution whose standard deviation is + set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01]. + clip_range (list[float]): Clip the randomly generated jitter + noise into this range. If None is given, don't perform clipping. + Defaults to [-0.05, 0.05]. + + Note: + This transform should only be used in point cloud segmentation tasks + because we don't transform ground-truth bboxes accordingly. + For a similar transform in detection tasks, please refer to `ObjectNoise`. + """ + + def __init__(self, + jitter_std=[0.01, 0.01, 0.01], + clip_range=[-0.05, 0.05]): + seq_types = (list, tuple, np.ndarray) + if not isinstance(jitter_std, seq_types): + assert isinstance(jitter_std, (int, float)), \ + f'unsupported jitter_std type {type(jitter_std)}' + jitter_std = [jitter_std, jitter_std, jitter_std] + self.jitter_std = jitter_std + + if clip_range is not None: + if not isinstance(clip_range, seq_types): + assert isinstance(clip_range, (int, float)), \ + f'unsupported clip_range type {type(clip_range)}' + clip_range = [-clip_range, clip_range] + self.clip_range = clip_range + + def __call__(self, input_dict): + """Call function to jitter all the points in the scene. + + Args: + input_dict (dict): Result dict from loading pipeline.
+ + Returns: + dict: Results after adding noise to each point, + 'points' key is updated in the result dict. + """ + points = input_dict['points'] + jitter_std = np.array(self.jitter_std, dtype=np.float32) + jitter_noise = \ + np.random.randn(points.shape[0], 3) * jitter_std[None, :] + if self.clip_range is not None: + jitter_noise = np.clip(jitter_noise, self.clip_range[0], + self.clip_range[1]) + + points.translate(jitter_noise) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(jitter_std={self.jitter_std},' + repr_str += f' clip_range={self.clip_range})' + return repr_str + + +@PIPELINES.register_module() +class ObjectSample(object): + """Sample GT objects to the data. + + Args: + db_sampler (dict): Config dict of the database sampler. + sample_2d (bool): Whether to also paste 2D image patches onto the images. + This should be True when applying multi-modality cut-and-paste. + Defaults to False. + use_ground_plane (bool): Whether to use the ground plane to adjust the + 3D labels. Defaults to False. + """ + + def __init__(self, db_sampler, sample_2d=False, use_ground_plane=False): + self.sampler_cfg = db_sampler + self.sample_2d = sample_2d + if 'type' not in db_sampler.keys(): + db_sampler['type'] = 'DataBaseSampler' + self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) + self.use_ground_plane = use_ground_plane + + @staticmethod + def remove_points_in_boxes(points, boxes): + """Remove the points in the sampled bounding boxes. + + Args: + points (:obj:`BasePoints`): Input point cloud array. + boxes (np.ndarray): Sampled ground truth boxes. + + Returns: + :obj:`BasePoints`: Points with those in the boxes removed. + """ + masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes) + points = points[np.logical_not(masks.any(-1))] + return points + + def __call__(self, input_dict): + """Call function to sample ground truth objects to the data. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after object sampling augmentation, + 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated + in the result dict.
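+ Example:
+ >>> # Illustrative sketch of a sampler config; the paths, classes and
+ >>> # thresholds below are placeholders, not values from this repository.
+ >>> db_sampler = dict(
+ ...     data_root='data/kitti/',
+ ...     info_path='data/kitti/kitti_dbinfos_train.pkl',
+ ...     rate=1.0,
+ ...     prepare=dict(filter_by_min_points=dict(Car=5)),
+ ...     classes=['Car'],
+ ...     sample_groups=dict(Car=15))
+ >>> transform = dict(type='ObjectSample', db_sampler=db_sampler)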
+ """ + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + + if self.use_ground_plane and 'plane' in input_dict['ann_info']: + ground_plane = input_dict['ann_info']['plane'] + input_dict['plane'] = ground_plane + else: + ground_plane = None + # change to float for blending operation + points = input_dict['points'] + if self.sample_2d: + img = input_dict['img'] + gt_bboxes_2d = input_dict['gt_bboxes'] + # Assume for now 3D & 2D bboxes are the same + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d.tensor.numpy(), + gt_labels_3d, + gt_bboxes_2d=gt_bboxes_2d, + img=img) + else: + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d.tensor.numpy(), + gt_labels_3d, + img=None, + ground_plane=ground_plane) + + if sampled_dict is not None: + sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] + sampled_points = sampled_dict['points'] + sampled_gt_labels = sampled_dict['gt_labels_3d'] + + gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels], + axis=0) + gt_bboxes_3d = gt_bboxes_3d.new_box( + np.concatenate( + [gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d])) + + points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) + # check the points dimension + points = points.cat([sampled_points, points]) + + if self.sample_2d: + sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d'] + gt_bboxes_2d = np.concatenate( + [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) + + input_dict['gt_bboxes'] = gt_bboxes_2d + input_dict['img'] = sampled_dict['img'] + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.int64) + input_dict['points'] = points + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f' sample_2d={self.sample_2d},' + repr_str += f' data_root={self.sampler_cfg.data_root},' + repr_str += f' info_path={self.sampler_cfg.info_path},' + repr_str += f' rate={self.sampler_cfg.rate},' + repr_str += f' prepare={self.sampler_cfg.prepare},' + repr_str += f' classes={self.sampler_cfg.classes},' + repr_str += f' sample_groups={self.sampler_cfg.sample_groups}' + return repr_str + + +@PIPELINES.register_module() +class ObjectNoise(object): + """Apply noise to each GT objects in the scene. + + Args: + translation_std (list[float], optional): Standard deviation of the + distribution where translation noise are sampled from. + Defaults to [0.25, 0.25, 0.25]. + global_rot_range (list[float], optional): Global rotation to the scene. + Defaults to [0.0, 0.0]. + rot_range (list[float], optional): Object rotation range. + Defaults to [-0.15707963267, 0.15707963267]. + num_try (int, optional): Number of times to try if the noise applied is + invalid. Defaults to 100. + """ + + def __init__(self, + translation_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + rot_range=[-0.15707963267, 0.15707963267], + num_try=100): + self.translation_std = translation_std + self.global_rot_range = global_rot_range + self.rot_range = rot_range + self.num_try = num_try + + def __call__(self, input_dict): + """Call function to apply noise to each ground truth in the scene. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after adding noise to each object, + 'points', 'gt_bboxes_3d' keys are updated in the result dict. 
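+ Example:
+ >>> # Illustrative pipeline entry reusing the documented defaults.
+ >>> transform = dict(
+ ...     type='ObjectNoise',
+ ...     translation_std=[0.25, 0.25, 0.25],
+ ...     global_rot_range=[0.0, 0.0],
+ ...     rot_range=[-0.15707963267, 0.15707963267])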
+ """ + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + + # TODO: this is inplace operation + numpy_box = gt_bboxes_3d.tensor.numpy() + numpy_points = points.tensor.numpy() + + noise_per_object_v3_( + numpy_box, + numpy_points, + rotation_perturb=self.rot_range, + center_noise_std=self.translation_std, + global_random_rot_range=self.global_rot_range, + num_try=self.num_try) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box) + input_dict['points'] = points.new_point(numpy_points) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_try={self.num_try},' + repr_str += f' translation_std={self.translation_std},' + repr_str += f' global_rot_range={self.global_rot_range},' + repr_str += f' rot_range={self.rot_range})' + return repr_str + + +@PIPELINES.register_module() +class GlobalAlignment(object): + """Apply global alignment to 3D scene points by rotation and translation. + + Args: + rotation_axis (int): Rotation axis for points and bboxes rotation. + + Note: + We do not record the applied rotation and translation as in + GlobalRotScaleTrans. Because usually, we do not need to reverse + the alignment step. + For example, ScanNet 3D detection task uses aligned ground-truth + bounding boxes for evaluation. + """ + + def __init__(self, rotation_axis): + self.rotation_axis = rotation_axis + + def _trans_points(self, input_dict, trans_factor): + """Private function to translate points. + + Args: + input_dict (dict): Result dict from loading pipeline. + trans_factor (np.ndarray): Translation vector to be applied. + + Returns: + dict: Results after translation, 'points' is updated in the dict. + """ + input_dict['points'].translate(trans_factor) + + def _rot_points(self, input_dict, rot_mat): + """Private function to rotate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + rot_mat (np.ndarray): Rotation matrix to be applied. + + Returns: + dict: Results after rotation, 'points' is updated in the dict. + """ + # input should be rot_mat_T so I transpose it here + input_dict['points'].rotate(rot_mat.T) + + def _check_rot_mat(self, rot_mat): + """Check if rotation matrix is valid for self.rotation_axis. + + Args: + rot_mat (np.ndarray): Rotation matrix to be applied. + """ + is_valid = np.allclose(np.linalg.det(rot_mat), 1.0) + valid_array = np.zeros(3) + valid_array[self.rotation_axis] = 1.0 + is_valid &= (rot_mat[self.rotation_axis, :] == valid_array).all() + is_valid &= (rot_mat[:, self.rotation_axis] == valid_array).all() + assert is_valid, f'invalid rotation matrix {rot_mat}' + + def __call__(self, input_dict): + """Call function to shuffle points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after global alignment, 'points' and keys in + input_dict['bbox3d_fields'] are updated in the result dict. 
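+ Example:
+ >>> # Illustrative entry; rotation_axis=2 (the z-axis) is an assumed
+ >>> # choice for indoor scans, not a value mandated by this module.
+ >>> transform = dict(type='GlobalAlignment', rotation_axis=2)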
+ """ + assert 'axis_align_matrix' in input_dict['ann_info'].keys(), \ + 'axis_align_matrix is not provided in GlobalAlignment' + + axis_align_matrix = input_dict['ann_info']['axis_align_matrix'] + assert axis_align_matrix.shape == (4, 4), \ + f'invalid shape {axis_align_matrix.shape} for axis_align_matrix' + rot_mat = axis_align_matrix[:3, :3] + trans_vec = axis_align_matrix[:3, -1] + + self._check_rot_mat(rot_mat) + self._rot_points(input_dict, rot_mat) + self._trans_points(input_dict, trans_vec) + + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(rotation_axis={self.rotation_axis})' + return repr_str + + +@PIPELINES.register_module() +class GlobalRotScaleTrans(object): + """Apply global rotation, scaling and translation to a 3D scene. + + Args: + rot_range (list[float], optional): Range of rotation angle. + Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). + scale_ratio_range (list[float], optional): Range of scale ratio. + Defaults to [0.95, 1.05]. + translation_std (list[float], optional): The standard deviation of + translation noise applied to a scene, which + is sampled from a gaussian distribution whose standard deviation + is set by ``translation_std``. Defaults to [0, 0, 0] + shift_height (bool, optional): Whether to shift height. + (the fourth dimension of indoor points) when scaling. + Defaults to False. + """ + + def __init__(self, + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0], + shift_height=False): + seq_types = (list, tuple, np.ndarray) + if not isinstance(rot_range, seq_types): + assert isinstance(rot_range, (int, float)), \ + f'unsupported rot_range type {type(rot_range)}' + rot_range = [-rot_range, rot_range] + self.rot_range = rot_range + + assert isinstance(scale_ratio_range, seq_types), \ + f'unsupported scale_ratio_range type {type(scale_ratio_range)}' + self.scale_ratio_range = scale_ratio_range + + if not isinstance(translation_std, seq_types): + assert isinstance(translation_std, (int, float)), \ + f'unsupported translation_std type {type(translation_std)}' + translation_std = [ + translation_std, translation_std, translation_std + ] + assert all([std >= 0 for std in translation_std]), \ + 'translation_std should be positive' + self.translation_std = translation_std + self.shift_height = shift_height + + def _trans_bbox_points(self, input_dict): + """Private function to translate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after translation, 'points', 'pcd_trans' + and keys in input_dict['bbox3d_fields'] are updated + in the result dict. + """ + translation_std = np.array(self.translation_std, dtype=np.float32) + trans_factor = np.random.normal(scale=translation_std, size=3).T + + input_dict['points'].translate(trans_factor) + input_dict['pcd_trans'] = trans_factor + for key in input_dict['bbox3d_fields']: + input_dict[key].translate(trans_factor) + + def _rot_bbox_points(self, input_dict): + """Private function to rotate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after rotation, 'points', 'pcd_rotation' + and keys in input_dict['bbox3d_fields'] are updated + in the result dict. 
+ """ + rotation = self.rot_range + noise_rotation = np.random.uniform(rotation[0], rotation[1]) + + # if no bbox in input_dict, only rotate points + if len(input_dict['bbox3d_fields']) == 0: + rot_mat_T = input_dict['points'].rotate(noise_rotation) + input_dict['pcd_rotation'] = rot_mat_T + input_dict['pcd_rotation_angle'] = noise_rotation + return + + # rotate points with bboxes + for key in input_dict['bbox3d_fields']: + if len(input_dict[key].tensor) != 0: + points, rot_mat_T = input_dict[key].rotate( + noise_rotation, input_dict['points']) + input_dict['points'] = points + input_dict['pcd_rotation'] = rot_mat_T + input_dict['pcd_rotation_angle'] = noise_rotation + + def _scale_bbox_points(self, input_dict): + """Private function to scale bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'points'and keys in + input_dict['bbox3d_fields'] are updated in the result dict. + """ + scale = input_dict['pcd_scale_factor'] + points = input_dict['points'] + points.scale(scale) + if self.shift_height: + assert 'height' in points.attribute_dims.keys(), \ + 'setting shift_height=True but points have no height attribute' + points.tensor[:, points.attribute_dims['height']] *= scale + input_dict['points'] = points + + for key in input_dict['bbox3d_fields']: + input_dict[key].scale(scale) + + def _random_scale(self, input_dict): + """Private function to randomly set the scale factor. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'pcd_scale_factor' are updated + in the result dict. + """ + scale_factor = np.random.uniform(self.scale_ratio_range[0], + self.scale_ratio_range[1]) + input_dict['pcd_scale_factor'] = scale_factor + + def __call__(self, input_dict): + """Private function to rotate, scale and translate bounding boxes and + points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'points', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans' and keys in + input_dict['bbox3d_fields'] are updated in the result dict. + """ + if 'transformation_3d_flow' not in input_dict: + input_dict['transformation_3d_flow'] = [] + + self._rot_bbox_points(input_dict) + + if 'pcd_scale_factor' not in input_dict: + self._random_scale(input_dict) + self._scale_bbox_points(input_dict) + + self._trans_bbox_points(input_dict) + + input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(rot_range={self.rot_range},' + repr_str += f' scale_ratio_range={self.scale_ratio_range},' + repr_str += f' translation_std={self.translation_std},' + repr_str += f' shift_height={self.shift_height})' + return repr_str + + +@PIPELINES.register_module() +class PointShuffle(object): + """Shuffle input points.""" + + def __call__(self, input_dict): + """Call function to shuffle points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. 
+ """ + idx = input_dict['points'].shuffle() + idx = idx.numpy() + + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[idx] + + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[idx] + + return input_dict + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class ObjectRangeFilter(object): + """Filter objects by the range. + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter objects by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' + keys are updated in the result dict. + """ + # Check points instance type and initialise bev_range + if isinstance(input_dict['gt_bboxes_3d'], + (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + bev_range = self.pcd_range[[0, 1, 3, 4]] + elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): + bev_range = self.pcd_range[[0, 2, 3, 5]] + + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + mask = gt_bboxes_3d.in_range_bev(bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class PointsRangeFilter(object): + """Filter points by the range. + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter points by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. 
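+ Example:
+ >>> # Illustrative entry; the range below is an assumed
+ >>> # [x_min, y_min, z_min, x_max, y_max, z_max] in meters.
+ >>> transform = dict(
+ ...     type='PointsRangeFilter',
+ ...     point_cloud_range=[0, -40, -3, 70.4, 40, 1])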
+ """ + points = input_dict['points'] + points_mask = points.in_range_3d(self.pcd_range) + clean_points = points[points_mask] + input_dict['points'] = clean_points + points_mask = points_mask.numpy() + + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[points_mask] + + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class ObjectNameFilter(object): + """Filter GT objects by their names. + + Args: + classes (list[str]): List of class names to be kept for training. + """ + + def __init__(self, classes): + self.classes = classes + self.labels = list(range(len(self.classes))) + + def __call__(self, input_dict): + """Call function to filter objects by their names. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' + keys are updated in the result dict. + """ + gt_labels_3d = input_dict['gt_labels_3d'] + gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], + dtype=np.bool_) + input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] + input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(classes={self.classes})' + return repr_str + + +@PIPELINES.register_module() +class PointSample(object): + """Point sample. + + Sampling data to a certain number. + + Args: + num_points (int): Number of points to be sampled. + sample_range (float, optional): The range where to sample points. + If not None, the points with depth larger than `sample_range` are + prior to be sampled. Defaults to None. + replace (bool, optional): Whether the sampling is with or without + replacement. Defaults to False. + """ + + def __init__(self, num_points, sample_range=None, replace=False): + self.num_points = num_points + self.sample_range = sample_range + self.replace = replace + + def _points_random_sampling(self, + points, + num_samples, + sample_range=None, + replace=False, + return_choices=False): + """Points random sampling. + + Sample points to a certain number. + + Args: + points (np.ndarray | :obj:`BasePoints`): 3D Points. + num_samples (int): Number of samples to be sampled. + sample_range (float, optional): Indicating the range where the + points will be sampled. Defaults to None. + replace (bool, optional): Sampling with or without replacement. + Defaults to None. + return_choices (bool, optional): Whether return choice. + Defaults to False. + Returns: + tuple[np.ndarray] | np.ndarray: + - points (np.ndarray | :obj:`BasePoints`): 3D Points. + - choices (np.ndarray, optional): The generated random samples. 
+ """ + if not replace: + replace = (points.shape[0] < num_samples) + point_range = range(len(points)) + if sample_range is not None and not replace: + # Only sampling the near points when len(points) >= num_samples + dist = np.linalg.norm(points.tensor, axis=1) + far_inds = np.where(dist >= sample_range)[0] + near_inds = np.where(dist < sample_range)[0] + # in case there are too many far points + if len(far_inds) > num_samples: + far_inds = np.random.choice( + far_inds, num_samples, replace=False) + point_range = near_inds + num_samples -= len(far_inds) + choices = np.random.choice(point_range, num_samples, replace=replace) + if sample_range is not None and not replace: + choices = np.concatenate((far_inds, choices)) + # Shuffle points after sampling + np.random.shuffle(choices) + if return_choices: + return points[choices], choices + else: + return points[choices] + + def __call__(self, results): + """Call function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = results['points'] + points, choices = self._points_random_sampling( + points, + self.num_points, + self.sample_range, + self.replace, + return_choices=True) + results['points'] = points + + pts_instance_mask = results.get('pts_instance_mask', None) + pts_semantic_mask = results.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + pts_instance_mask = pts_instance_mask[choices] + results['pts_instance_mask'] = pts_instance_mask + + if pts_semantic_mask is not None: + pts_semantic_mask = pts_semantic_mask[choices] + results['pts_semantic_mask'] = pts_semantic_mask + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_points={self.num_points},' + repr_str += f' sample_range={self.sample_range},' + repr_str += f' replace={self.replace})' + + return repr_str + + +@PIPELINES.register_module() +class IndoorPointSample(PointSample): + """Indoor point sample. + + Sampling data to a certain number. + NOTE: IndoorPointSample is deprecated in favor of PointSample + + Args: + num_points (int): Number of points to be sampled. + """ + + def __init__(self, *args, **kwargs): + warnings.warn( + 'IndoorPointSample is deprecated in favor of PointSample') + super(IndoorPointSample, self).__init__(*args, **kwargs) + + +@PIPELINES.register_module() +class IndoorPatchPointSample(object): + r"""Indoor point sample within a patch. Modified from `PointNet++ `_. + + Sampling data to a certain number for semantic segmentation. + + Args: + num_points (int): Number of points to be sampled. + block_size (float, optional): Size of a block to sample points from. + Defaults to 1.5. + sample_rate (float, optional): Stride used in sliding patch generation. + This parameter is unused in `IndoorPatchPointSample` and thus has + been deprecated. We plan to remove it in the future. + Defaults to None. + ignore_index (int, optional): Label index that won't be used for the + segmentation task. This is set in PointSegClassMapping as neg_cls. + If not None, will be used as a patch selection criterion. + Defaults to None. + use_normalized_coord (bool, optional): Whether to use normalized xyz as + additional features. Defaults to False. + num_try (int, optional): Number of times to try if the patch selected + is invalid. Defaults to 10. 
+ enlarge_size (float, optional): Enlarge the sampled patch to + [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as + an augmentation. If None, set it as 0. Defaults to 0.2. + min_unique_num (int, optional): Minimum number of unique points + the sampled patch should contain. If None, use PointNet++'s method + to judge uniqueness. Defaults to None. + eps (float, optional): A value added to patch boundary to guarantee + points coverage. Defaults to 1e-2. + + Note: + This transform should only be used in the training process of point + cloud segmentation tasks. For the sliding patch generation and + inference process in testing, please refer to the `slide_inference` + function of `EncoderDecoder3D` class. + """ + + def __init__(self, + num_points, + block_size=1.5, + sample_rate=None, + ignore_index=None, + use_normalized_coord=False, + num_try=10, + enlarge_size=0.2, + min_unique_num=None, + eps=1e-2): + self.num_points = num_points + self.block_size = block_size + self.ignore_index = ignore_index + self.use_normalized_coord = use_normalized_coord + self.num_try = num_try + self.enlarge_size = enlarge_size if enlarge_size is not None else 0.0 + self.min_unique_num = min_unique_num + self.eps = eps + + if sample_rate is not None: + warnings.warn( + "'sample_rate' has been deprecated and will be removed in " + 'the future. Please remove them from your code.') + + def _input_generation(self, coords, patch_center, coord_max, attributes, + attribute_dims, point_type): + """Generating model input. + + Generate input by subtracting patch center and adding additional + features. Currently support colors and normalized xyz as features. + + Args: + coords (np.ndarray): Sampled 3D Points. + patch_center (np.ndarray): Center coordinate of the selected patch. + coord_max (np.ndarray): Max coordinate of all 3D Points. + attributes (np.ndarray): features of input points. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. + point_type (type): class of input points inherited from BasePoints. + + Returns: + :obj:`BasePoints`: The generated input data. + """ + # subtract patch center, the z dimension is not centered + centered_coords = coords.copy() + centered_coords[:, 0] -= patch_center[0] + centered_coords[:, 1] -= patch_center[1] + + if self.use_normalized_coord: + normalized_coord = coords / coord_max + attributes = np.concatenate([attributes, normalized_coord], axis=1) + if attribute_dims is None: + attribute_dims = dict() + attribute_dims.update( + dict(normalized_coord=[ + attributes.shape[1], attributes.shape[1] + + 1, attributes.shape[1] + 2 + ])) + + points = np.concatenate([centered_coords, attributes], axis=1) + points = point_type( + points, points_dim=points.shape[1], attribute_dims=attribute_dims) + + return points + + def _patch_points_sampling(self, points, sem_mask): + """Patch points sampling. + + First sample a valid patch. + Then sample points within that patch to a certain number. + + Args: + points (:obj:`BasePoints`): 3D Points. + sem_mask (np.ndarray): semantic segmentation mask for input points. + + Returns: + tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`: + + - points (:obj:`BasePoints`): 3D Points. + - choices (np.ndarray): The generated random samples. 
+ """ + coords = points.coord.numpy() + attributes = points.tensor[:, 3:].numpy() + attribute_dims = points.attribute_dims + point_type = type(points) + + coord_max = np.amax(coords, axis=0) + coord_min = np.amin(coords, axis=0) + + for _ in range(self.num_try): + # random sample a point as patch center + cur_center = coords[np.random.choice(coords.shape[0])] + + # boundary of a patch, which would be enlarged by + # `self.enlarge_size` as an augmentation + cur_max = cur_center + np.array( + [self.block_size / 2.0, self.block_size / 2.0, 0.0]) + cur_min = cur_center - np.array( + [self.block_size / 2.0, self.block_size / 2.0, 0.0]) + cur_max[2] = coord_max[2] + cur_min[2] = coord_min[2] + cur_choice = np.sum( + (coords >= (cur_min - self.enlarge_size)) * + (coords <= (cur_max + self.enlarge_size)), + axis=1) == 3 + + if not cur_choice.any(): # no points in this patch + continue + + cur_coords = coords[cur_choice, :] + cur_sem_mask = sem_mask[cur_choice] + point_idxs = np.where(cur_choice)[0] + mask = np.sum( + (cur_coords >= (cur_min - self.eps)) * (cur_coords <= + (cur_max + self.eps)), + axis=1) == 3 + + # two criteria for patch sampling, adopted from PointNet++ + # 1. selected patch should contain enough unique points + if self.min_unique_num is None: + # use PointNet++'s method as default + # [31, 31, 62] are just some big values used to transform + # coords from 3d array to 1d and then check their uniqueness + # this is used in all the ScanNet code following PointNet++ + vidx = np.ceil( + (cur_coords[mask, :] - cur_min) / (cur_max - cur_min) * + np.array([31.0, 31.0, 62.0])) + vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 + + vidx[:, 2]) + flag1 = len(vidx) / 31.0 / 31.0 / 62.0 >= 0.02 + else: + # if `min_unique_num` is provided, directly compare with it + flag1 = mask.sum() >= self.min_unique_num + + # 2. selected patch should contain enough annotated points + if self.ignore_index is None: + flag2 = True + else: + flag2 = np.sum(cur_sem_mask != self.ignore_index) / \ + len(cur_sem_mask) >= 0.7 + + if flag1 and flag2: + break + + # sample idx to `self.num_points` + if point_idxs.size >= self.num_points: + # no duplicate in sub-sampling + choices = np.random.choice( + point_idxs, self.num_points, replace=False) + else: + # do not use random choice here to avoid some points not counted + dup = np.random.choice(point_idxs.size, + self.num_points - point_idxs.size) + idx_dup = np.concatenate( + [np.arange(point_idxs.size), + np.array(dup)], 0) + choices = point_idxs[idx_dup] + + # construct model input + points = self._input_generation(coords[choices], cur_center, coord_max, + attributes[choices], attribute_dims, + point_type) + + return points, choices + + def __call__(self, results): + """Call function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. 
+ """ + points = results['points'] + + assert 'pts_semantic_mask' in results.keys(), \ + 'semantic mask should be provided in training and evaluation' + pts_semantic_mask = results['pts_semantic_mask'] + + points, choices = self._patch_points_sampling(points, + pts_semantic_mask) + + results['points'] = points + results['pts_semantic_mask'] = pts_semantic_mask[choices] + pts_instance_mask = results.get('pts_instance_mask', None) + if pts_instance_mask is not None: + results['pts_instance_mask'] = pts_instance_mask[choices] + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_points={self.num_points},' + repr_str += f' block_size={self.block_size},' + repr_str += f' ignore_index={self.ignore_index},' + repr_str += f' use_normalized_coord={self.use_normalized_coord},' + repr_str += f' num_try={self.num_try},' + repr_str += f' enlarge_size={self.enlarge_size},' + repr_str += f' min_unique_num={self.min_unique_num},' + repr_str += f' eps={self.eps})' + return repr_str + + +@PIPELINES.register_module() +class BackgroundPointsFilter(object): + """Filter background points near the bounding box. + + Args: + bbox_enlarge_range (tuple[float], float): Bbox enlarge range. + """ + + def __init__(self, bbox_enlarge_range): + assert (is_tuple_of(bbox_enlarge_range, float) + and len(bbox_enlarge_range) == 3) \ + or isinstance(bbox_enlarge_range, float), \ + f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}' + + if isinstance(bbox_enlarge_range, float): + bbox_enlarge_range = [bbox_enlarge_range] * 3 + self.bbox_enlarge_range = np.array( + bbox_enlarge_range, dtype=np.float32)[np.newaxis, :] + + def __call__(self, input_dict): + """Call function to filter points by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = input_dict['points'] + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + + # avoid groundtruth being modified + gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy() + gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.clone().numpy() + + enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy() + enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range + points_numpy = points.tensor.clone().numpy() + foreground_masks = box_np_ops.points_in_rbbox( + points_numpy, gt_bboxes_3d_np, origin=(0.5, 0.5, 0.5)) + enlarge_foreground_masks = box_np_ops.points_in_rbbox( + points_numpy, enlarged_gt_bboxes_3d, origin=(0.5, 0.5, 0.5)) + foreground_masks = foreground_masks.max(1) + enlarge_foreground_masks = enlarge_foreground_masks.max(1) + valid_masks = ~np.logical_and(~foreground_masks, + enlarge_foreground_masks) + + input_dict['points'] = points[valid_masks] + pts_instance_mask = input_dict.get('pts_instance_mask', None) + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks] + + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks] + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class VoxelBasedPointSampler(object): + """Voxel based point sampler. 
+ + Apply voxel sampling to multiple sweep points. + + Args: + cur_sweep_cfg (dict): Config for sampling current points. + prev_sweep_cfg (dict): Config for sampling previous points. + time_dim (int): Index that indicate the time dimension + for input points. + """ + + def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3): + self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg) + self.cur_voxel_num = self.cur_voxel_generator._max_voxels + self.time_dim = time_dim + if prev_sweep_cfg is not None: + assert prev_sweep_cfg['max_num_points'] == \ + cur_sweep_cfg['max_num_points'] + self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg) + self.prev_voxel_num = self.prev_voxel_generator._max_voxels + else: + self.prev_voxel_generator = None + self.prev_voxel_num = 0 + + def _sample_points(self, points, sampler, point_dim): + """Sample points for each points subset. + + Args: + points (np.ndarray): Points subset to be sampled. + sampler (VoxelGenerator): Voxel based sampler for + each points subset. + point_dim (int): The dimension of each points + + Returns: + np.ndarray: Sampled points. + """ + voxels, coors, num_points_per_voxel = sampler.generate(points) + if voxels.shape[0] < sampler._max_voxels: + padding_points = np.zeros([ + sampler._max_voxels - voxels.shape[0], sampler._max_num_points, + point_dim + ], + dtype=points.dtype) + padding_points[:] = voxels[0] + sample_points = np.concatenate([voxels, padding_points], axis=0) + else: + sample_points = voxels + + return sample_points + + def __call__(self, results): + """Call function to sample points from multiple sweeps. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = results['points'] + original_dim = points.shape[1] + + # TODO: process instance and semantic mask while _max_num_points + # is larger than 1 + # Extend points with seg and mask fields + map_fields2dim = [] + start_dim = original_dim + points_numpy = points.tensor.numpy() + extra_channel = [points_numpy] + for idx, key in enumerate(results['pts_mask_fields']): + map_fields2dim.append((key, idx + start_dim)) + extra_channel.append(results[key][..., None]) + + start_dim += len(results['pts_mask_fields']) + for idx, key in enumerate(results['pts_seg_fields']): + map_fields2dim.append((key, idx + start_dim)) + extra_channel.append(results[key][..., None]) + + points_numpy = np.concatenate(extra_channel, axis=-1) + + # Split points into two part, current sweep points and + # previous sweeps points. + # TODO: support different sampling methods for next sweeps points + # and previous sweeps points. 
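+        # The split below assumes the multi-sweep loading convention where
+        # the channel at `time_dim` stores the relative sweep timestamp and
+        # the current frame has timestamp 0; e.g. with time_dim=3 a point
+        # [x, y, z, 0.0, ...] belongs to the current sweep, while
+        # [x, y, z, 0.5, ...] comes from an earlier sweep.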
+ cur_points_flag = (points_numpy[:, self.time_dim] == 0) + cur_sweep_points = points_numpy[cur_points_flag] + prev_sweeps_points = points_numpy[~cur_points_flag] + if prev_sweeps_points.shape[0] == 0: + prev_sweeps_points = cur_sweep_points + + # Shuffle points before sampling + np.random.shuffle(cur_sweep_points) + np.random.shuffle(prev_sweeps_points) + + cur_sweep_points = self._sample_points(cur_sweep_points, + self.cur_voxel_generator, + points_numpy.shape[1]) + if self.prev_voxel_generator is not None: + prev_sweeps_points = self._sample_points(prev_sweeps_points, + self.prev_voxel_generator, + points_numpy.shape[1]) + + points_numpy = np.concatenate( + [cur_sweep_points, prev_sweeps_points], 0) + else: + points_numpy = cur_sweep_points + + if self.cur_voxel_generator._max_num_points == 1: + points_numpy = points_numpy.squeeze(1) + results['points'] = points.new_point(points_numpy[..., :original_dim]) + + # Restore the corresponding seg and mask fields + for key, dim_index in map_fields2dim: + results[key] = points_numpy[..., dim_index] + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + + def _auto_indent(repr_str, indent): + repr_str = repr_str.split('\n') + repr_str = [' ' * indent + t + '\n' for t in repr_str] + repr_str = ''.join(repr_str)[:-1] + return repr_str + + repr_str = self.__class__.__name__ + indent = 4 + repr_str += '(\n' + repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n' + repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n' + repr_str += ' ' * indent + f'time_dim={self.time_dim},\n' + repr_str += ' ' * indent + 'cur_voxel_generator=\n' + repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n' + repr_str += ' ' * indent + 'prev_voxel_generator=\n' + repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})' + return repr_str + + +@PIPELINES.register_module() +class AffineResize(object): + """Get the affine transform matrices to the target size. + + Different from :class:`RandomAffine` in MMDetection, this class can + calculate the affine transform matrices while resizing the input image + to a fixed size. The affine transform matrices include: 1) matrix + transforming original image to the network input image size. 2) matrix + transforming original image to the network output feature map size. + + Args: + img_scale (tuple): Images scales for resizing. + down_ratio (int): The down ratio of feature map. + Actually the arg should be >= 1. + bbox_clip_border (bool, optional): Whether clip the objects + outside the border of the image. Defaults to True. + """ + + def __init__(self, img_scale, down_ratio, bbox_clip_border=True): + + self.img_scale = img_scale + self.down_ratio = down_ratio + self.bbox_clip_border = bbox_clip_border + + def __call__(self, results): + """Call function to do affine transform to input image and labels. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Results after affine resize, 'affine_aug', 'trans_mat' + keys are added in the result dict. 
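
+        Example:
+            ``AffineResize`` is normally preceded by ``RandomShiftScale`` so
+            that the recorded 'center' and 'size' are consumed here. An
+            illustrative monocular-3D pipeline fragment (parameter values
+            are examples only, not taken from this repository)::
+
+                dict(type='RandomShiftScale', shift_scale=(0.2, 0.4),
+                     aug_prob=0.3),
+                dict(type='AffineResize', img_scale=(1280, 384),
+                     down_ratio=4),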
+ """ + # The results have gone through RandomShiftScale before AffineResize + if 'center' not in results: + img = results['img'] + height, width = img.shape[:2] + center = np.array([width / 2, height / 2], dtype=np.float32) + size = np.array([width, height], dtype=np.float32) + results['affine_aug'] = False + else: + # The results did not go through RandomShiftScale before + # AffineResize + img = results['img'] + center = results['center'] + size = results['size'] + + trans_affine = self._get_transform_matrix(center, size, self.img_scale) + + img = cv2.warpAffine(img, trans_affine[:2, :], self.img_scale) + + if isinstance(self.down_ratio, tuple): + trans_mat = [ + self._get_transform_matrix( + center, size, + (self.img_scale[0] // ratio, self.img_scale[1] // ratio)) + for ratio in self.down_ratio + ] # (3, 3) + else: + trans_mat = self._get_transform_matrix( + center, size, (self.img_scale[0] // self.down_ratio, + self.img_scale[1] // self.down_ratio)) + + results['img'] = img + results['img_shape'] = img.shape + results['pad_shape'] = img.shape + results['trans_mat'] = trans_mat + + self._affine_bboxes(results, trans_affine) + + if 'centers2d' in results: + centers2d = self._affine_transform(results['centers2d'], + trans_affine) + valid_index = (centers2d[:, 0] > + 0) & (centers2d[:, 0] < + self.img_scale[0]) & (centers2d[:, 1] > 0) & ( + centers2d[:, 1] < self.img_scale[1]) + results['centers2d'] = centers2d[valid_index] + + for key in results.get('bbox_fields', []): + if key in ['gt_bboxes']: + results[key] = results[key][valid_index] + if 'gt_labels' in results: + results['gt_labels'] = results['gt_labels'][ + valid_index] + if 'gt_masks' in results: + raise NotImplementedError( + 'AffineResize only supports bbox.') + + for key in results.get('bbox3d_fields', []): + if key in ['gt_bboxes_3d']: + results[key].tensor = results[key].tensor[valid_index] + if 'gt_labels_3d' in results: + results['gt_labels_3d'] = results['gt_labels_3d'][ + valid_index] + + results['depths'] = results['depths'][valid_index] + + return results + + def _affine_bboxes(self, results, matrix): + """Affine transform bboxes to input image. + + Args: + results (dict): Result dict from loading pipeline. + matrix (np.ndarray): Matrix transforming original + image to the network input image size. + shape: (3, 3) + """ + + for key in results.get('bbox_fields', []): + bboxes = results[key] + bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix) + bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix) + if self.bbox_clip_border: + bboxes[:, + [0, 2]] = bboxes[:, + [0, 2]].clip(0, self.img_scale[0] - 1) + bboxes[:, + [1, 3]] = bboxes[:, + [1, 3]].clip(0, self.img_scale[1] - 1) + results[key] = bboxes + + def _affine_transform(self, points, matrix): + """Affine transform bbox points to input image. + + Args: + points (np.ndarray): Points to be transformed. + shape: (N, 2) + matrix (np.ndarray): Affine transform matrix. + shape: (3, 3) + + Returns: + np.ndarray: Transformed points. + """ + num_points = points.shape[0] + hom_points_2d = np.concatenate((points, np.ones((num_points, 1))), + axis=1) + hom_points_2d = hom_points_2d.T + affined_points = np.matmul(matrix, hom_points_2d).T + return affined_points[:, :2] + + def _get_transform_matrix(self, center, scale, output_scale): + """Get affine transform matrix. + + Args: + center (tuple): Center of current image. + scale (tuple): Scale of current image. + output_scale (tuple[float]): The transform target image scales. 
+ + Returns: + np.ndarray: Affine transform matrix. + """ + # TODO: further add rot and shift here. + src_w = scale[0] + dst_w = output_scale[0] + dst_h = output_scale[1] + + src_dir = np.array([0, src_w * -0.5]) + dst_dir = np.array([0, dst_w * -0.5]) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + src[1, :] = center + src_dir + dst[0, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir + + src[2, :] = self._get_ref_point(src[0, :], src[1, :]) + dst[2, :] = self._get_ref_point(dst[0, :], dst[1, :]) + + get_matrix = cv2.getAffineTransform(src, dst) + + matrix = np.concatenate((get_matrix, [[0., 0., 1.]])) + + return matrix.astype(np.float32) + + def _get_ref_point(self, ref_point1, ref_point2): + """Get reference point to calculate affine transform matrix. + + While using opencv to calculate the affine matrix, we need at least + three corresponding points separately on original image and target + image. Here we use two points to get the the third reference point. + """ + d = ref_point1 - ref_point2 + ref_point3 = ref_point2 + np.array([-d[1], d[0]]) + return ref_point3 + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'down_ratio={self.down_ratio}) ' + return repr_str + + +@PIPELINES.register_module() +class RandomShiftScale(object): + """Random shift scale. + + Different from the normal shift and scale function, it doesn't + directly shift or scale image. It can record the shift and scale + infos into loading pipelines. It's designed to be used with + AffineResize together. + + Args: + shift_scale (tuple[float]): Shift and scale range. + aug_prob (float): The shifting and scaling probability. + """ + + def __init__(self, shift_scale, aug_prob): + + self.shift_scale = shift_scale + self.aug_prob = aug_prob + + def __call__(self, results): + """Call function to record random shift and scale infos. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Results after random shift and scale, 'center', 'size' + and 'affine_aug' keys are added in the result dict. + """ + img = results['img'] + + height, width = img.shape[:2] + + center = np.array([width / 2, height / 2], dtype=np.float32) + size = np.array([width, height], dtype=np.float32) + + if random.random() < self.aug_prob: + shift, scale = self.shift_scale[0], self.shift_scale[1] + shift_ranges = np.arange(-shift, shift + 0.1, 0.1) + center[0] += size[0] * random.choice(shift_ranges) + center[1] += size[1] * random.choice(shift_ranges) + scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1) + size *= random.choice(scale_ranges) + results['affine_aug'] = True + else: + results['affine_aug'] = False + + results['center'] = center + results['size'] = size + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(shift_scale={self.shift_scale}, ' + repr_str += f'aug_prob={self.aug_prob}) ' + return repr_str + +@PIPELINES.register_module() +class DataFilter(object): + """Point sample. + + Sampling data to a certain number. + + Args: + num_points (int): Number of points to be sampled. + sample_range (float, optional): The range where to sample points. + If not None, the points with depth larger than `sample_range` are + prior to be sampled. Defaults to None. + replace (bool, optional): Whether the sampling is with or without + replacement. Defaults to False. 
+ """ + + def __init__(self, method='remove_statistical_outlier', params={"nb_neighbors" : 20, "std_ratio" : 0.05}): + self.method = method + self.params = params + + def remove_statistical_outlier(self, lidar_data, nb_neighbors, std_ratio): + + points = lidar_data['points'].tensor.numpy() + + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + filtered_pcd, ind = pcd.remove_statistical_outlier(nb_neighbors, std_ratio) + + filterer_points = np.asarray(filtered_pcd.points) + if (points.shape[1] > 3): + filterer_points = np.concatenate((filterer_points, points[ind, 3:]), axis=-1) + + filtered_lidar_data = lidar_data + filtered_lidar_data["points"] = LiDARPoints(filterer_points.astype(np.float32), points_dim=filterer_points.shape[-1], attribute_dims=None) + + return filtered_lidar_data + + def remove_radius_outlier(self, lidar_data, nb_points, radius): + + points = lidar_data['points'].tensor.numpy() + + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + filtered_pcd, ind = pcd.remove_radius_outlier(nb_points, radius) + + filterer_points = np.asarray(filtered_pcd.points) + if (points.shape[1] > 3): + filterer_points = np.concatenate((filterer_points, points[ind, 3:]), axis=-1) + + filtered_lidar_data = lidar_data + filtered_lidar_data["points"] = LiDARPoints(filterer_points.astype(np.float32), points_dim=filterer_points.shape[-1], attribute_dims=None) + + return filtered_lidar_data + + def voxel_down_sample(self, lidar_data, voxel_size): + + points = lidar_data['points'].tensor.numpy() + + pcd = o3d.geometry.PointCloud() + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + # filtered_pcd, ind = pcd.voxel_down_sample(voxel_size) + filtered_pcd = pcd.voxel_down_sample(voxel_size) + + filterer_points = np.asarray(filtered_pcd.points) + + filtered_lidar_data = lidar_data + filtered_lidar_data["points"] = LiDARPoints(filterer_points.astype(np.float32), points_dim=filterer_points.shape[-1], attribute_dims=None) + return filtered_lidar_data + + + def __call__(self, results): + """Call function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. + """ + if self.method == 'remove_statistical_outlier': + self.remove_statistical_outlier(results, **self.params) + elif self.method == 'remove_radius_outlier': + self.remove_radius_outlier(results, **self.params) + elif self.method == 'voxel_down_sample': + self.voxel_down_sample(results, **self.params) + + return results + +@PIPELINES.register_module() +class DataAugmentor(object): + """Point sample. + + Sampling data to a certain number. + + Args: + num_points (int): Number of points to be sampled. + sample_range (float, optional): The range where to sample points. + If not None, the points with depth larger than `sample_range` are + prior to be sampled. Defaults to None. + replace (bool, optional): Whether the sampling is with or without + replacement. Defaults to False. 
+ """ + + def __init__(self, method, params=None): + self.method = method + self.params = params + + def random_flip_along_x(self, results): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + enable = np.random.choice([False, True], replace=False, p=[0.5, 0.5]) + + if enable: + + gt_boxes[:, 1] = -gt_boxes[:, 1] + gt_boxes[:, 6] = -gt_boxes[:, 6] + points[:, 1] = -points[:, 1] + + if gt_boxes.shape[1] > 7: + gt_boxes[:, 8] = -gt_boxes[:, 8] + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def random_flip_along_y(self, results): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + enable = np.random.choice([False, True], replace=False, p=[0.5, 0.5]) + + if enable: + gt_boxes[:, 0] = -gt_boxes[:, 0] + gt_boxes[:, 6] = -(gt_boxes[:, 6] + np.pi) + # points[:, 0] = -points[:, 0] + + if gt_boxes.shape[1] > 7: + gt_boxes[:, 7] = -gt_boxes[:, 7] + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def global_scaling(self, results, scale_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + if scale_range[1] - scale_range[0] < 1e-3: + return results + noise_scale = np.random.uniform(scale_range[0], scale_range[1]) + points[:, :3] *= noise_scale + gt_boxes[:, :6] *= noise_scale + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def random_translation_along_x(self, results, offset_std): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + offset = np.random.normal(0, offset_std, 1) + + points[:, 0] += offset + gt_boxes[:, 0] += offset + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def random_translation_along_y(self, results, offset_std): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + offset = np.random.normal(0, offset_std, 1) + + points[:, 1] += offset + gt_boxes[:, 1] += offset + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def random_translation_along_z(self, results, offset_std): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = 
results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + offset = np.random.normal(0, offset_std, 1) + + points[:, 2] += offset + gt_boxes[:, 2] += offset + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def global_frustum_dropout_top(self, results, intensity_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + intensity = np.random.uniform(intensity_range[0], intensity_range[1]) + # threshold = max - length * uniform(0 ~ 0.2) + threshold = np.max(points[:, 2]) - intensity * (np.max(points[:, 2]) - np.min(points[:, 2])) + + points = points[points[:, 2] < threshold] + gt_boxes = gt_boxes[gt_boxes[:, 2] < threshold] + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def global_frustum_dropout_bottom(self, results, intensity_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + intensity = np.random.uniform(intensity_range[0], intensity_range[1]) + + threshold = np.min(points[:, 2]) + intensity * (np.max(points[:, 2]) - np.min(points[:, 2])) + points = points[points[:, 2] > threshold] + gt_boxes = gt_boxes[gt_boxes[:, 2] > threshold] + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def global_frustum_dropout_left(self, results, intensity_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + intensity = np.random.uniform(intensity_range[0], intensity_range[1]) + + threshold = np.max(points[:, 1]) - intensity * (np.max(points[:, 1]) - np.min(points[:, 1])) + points = points[points[:, 1] < threshold] + gt_boxes = gt_boxes[gt_boxes[:, 1] < threshold] + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def global_frustum_dropout_right(self, results, intensity_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + intensity = np.random.uniform(intensity_range[0], intensity_range[1]) + + threshold = np.min(points[:, 1]) + intensity * (np.max(points[:, 1]) - np.min(points[:, 1])) + points = points[points[:, 1] > threshold] + gt_boxes = gt_boxes[gt_boxes[:, 1] > threshold] + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + 
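+    # Note on the frustum dropout family above and below: each method draws
+    # an intensity from `intensity_range` and removes a slab of points on
+    # one side of the whole cloud (global variants) or of each GT box
+    # (local variants). As a worked example with made-up numbers: if the z
+    # values span [-3.0, 1.0] and the sampled intensity is 0.1, the "top"
+    # variant keeps only points with z < 1.0 - 0.1 * (1.0 - (-3.0)) = 0.6.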
+ def local_scaling(self, results, scale_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + if scale_range[1] - scale_range[0] < 1e-3: + return results + + # augs = {} + for idx, box in enumerate(gt_boxes): + noise_scale = np.random.uniform(scale_range[0], scale_range[1]) + # augs[f'object_{idx}'] = noise_scale + points_in_box, mask = get_points_in_box(points, box) + + # tranlation to axis center + points[mask, 0] -= box[0] + points[mask, 1] -= box[1] + points[mask, 2] -= box[2] + + # apply scaling + points[mask, :3] *= noise_scale + + # tranlation back to original position + points[mask, 0] += box[0] + points[mask, 1] += box[1] + points[mask, 2] += box[2] + + gt_boxes[idx, 3:6] *= noise_scale + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def random_local_translation_along_x(self, results, offset_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + for idx, box in enumerate(gt_boxes): + offset = np.random.uniform(offset_range[0], offset_range[1]) + # augs[f'object_{idx}'] = offset + points_in_box, mask = get_points_in_box(points, box) + points[mask, 0] += offset + + gt_boxes[idx, 0] += offset + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def random_local_translation_along_y(self, results, offset_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + for idx, box in enumerate(gt_boxes): + offset = np.random.uniform(offset_range[0], offset_range[1]) + # augs[f'object_{idx}'] = offset + points_in_box, mask = get_points_in_box(points, box) + points[mask, 1] += offset + + gt_boxes[idx, 1] += offset + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def random_local_translation_along_z(self, results, offset_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = results['ann_info']['Trv2c'] + + for idx, box in enumerate(gt_boxes): + offset = np.random.uniform(offset_range[0], offset_range[1]) + # augs[f'object_{idx}'] = offset + points_in_box, mask = get_points_in_box(points, box) + points[mask, 2] += offset + + gt_boxes[idx, 2] += offset + + results["points"] = LiDARPoints(points.astype(np.float32), + points_dim=points.shape[-1], attribute_dims=None) + results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to( + Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) + return results + + def local_frustum_dropout_top(self, results, intensity_range): + + gt_boxes = results['gt_bboxes_3d'].tensor.numpy() + points = results['points'].tensor.numpy() + rect = results['ann_info']['rect'] + Trv2c = 
results['ann_info']['Trv2c']
+
+        for idx, box in enumerate(gt_boxes):
+            x, y, z, dx, dy, dz = box[0], box[1], box[2], box[3], box[4], box[5]
+
+            intensity = np.random.uniform(intensity_range[0], intensity_range[1])
+            points_in_box, mask = get_points_in_box(points, box)
+            # drop the top slab of the box: keep points below the threshold
+            threshold = (z + dz / 2) - intensity * dz
+
+            points = points[np.logical_not(
+                np.logical_and(mask, points[:, 2] >= threshold))]
+
+        results["points"] = LiDARPoints(points.astype(np.float32),
+            points_dim=points.shape[-1], attribute_dims=None)
+        results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to(
+            Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c))
+        return results
+
+    def local_frustum_dropout_bottom(self, results, intensity_range):
+
+        gt_boxes = results['gt_bboxes_3d'].tensor.numpy()
+        points = results['points'].tensor.numpy()
+        rect = results['ann_info']['rect']
+        Trv2c = results['ann_info']['Trv2c']
+
+        for idx, box in enumerate(gt_boxes):
+            x, y, z, dx, dy, dz = box[0], box[1], box[2], box[3], box[4], box[5]
+
+            intensity = np.random.uniform(intensity_range[0], intensity_range[1])
+            points_in_box, mask = get_points_in_box(points, box)
+            # drop the bottom slab of the box: keep points above the threshold
+            threshold = (z - dz / 2) + intensity * dz
+
+            points = points[np.logical_not(
+                np.logical_and(mask, points[:, 2] <= threshold))]
+
+        results["points"] = LiDARPoints(points.astype(np.float32),
+            points_dim=points.shape[-1], attribute_dims=None)
+        results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to(
+            Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c))
+        return results
+
+    def local_frustum_dropout_left(self, results, intensity_range):
+
+        gt_boxes = results['gt_bboxes_3d'].tensor.numpy()
+        points = results['points'].tensor.numpy()
+        rect = results['ann_info']['rect']
+        Trv2c = results['ann_info']['Trv2c']
+
+        for idx, box in enumerate(gt_boxes):
+            x, y, z, dx, dy, dz = box[0], box[1], box[2], box[3], box[4], box[5]
+
+            intensity = np.random.uniform(intensity_range[0], intensity_range[1])
+            points_in_box, mask = get_points_in_box(points, box)
+            threshold = (y + dy / 2) - intensity * dy
+
+            points = points[np.logical_not(
+                np.logical_and(mask, points[:, 1] >= threshold))]
+
+        results["points"] = LiDARPoints(points.astype(np.float32),
+            points_dim=points.shape[-1], attribute_dims=None)
+        results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to(
+            Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c))
+        return results
+
+    def local_frustum_dropout_right(self, results, intensity_range):
+
+        gt_boxes = results['gt_bboxes_3d'].tensor.numpy()
+        points = results['points'].tensor.numpy()
+        rect = results['ann_info']['rect']
+        Trv2c = results['ann_info']['Trv2c']
+
+        for idx, box in enumerate(gt_boxes):
+            x, y, z, dx, dy, dz = box[0], box[1], box[2], box[3], box[4], box[5]
+
+            intensity = np.random.uniform(intensity_range[0], intensity_range[1])
+            points_in_box, mask = get_points_in_box(points, box)
+            threshold = (y - dy / 2) + intensity * dy
+
+            points = points[np.logical_not(
+                np.logical_and(mask, points[:, 1] <= threshold))]
+
+        results["points"] = LiDARPoints(points.astype(np.float32),
+            points_dim=points.shape[-1], attribute_dims=None)
+        results['gt_bboxes_3d'] = CameraInstance3DBoxes(gt_boxes).convert_to(
+            Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c))
+        return results
+
+    def __call__(self, results):
+        """Apply the augmentation selected by ``self.method``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Results after augmentation, the 'points' and
+                'gt_bboxes_3d' keys are updated in the result dict.
+ """ + + if self.method == 'random_flip_along_x': + results = self.random_flip_along_x(results) + elif self.method == 'random_flip_along_y': + results = self.random_flip_along_y(results) + + # elif self.method == 'global_rotation': + # gt_boxes, points = self.global_rotation(results, **self.params) + elif self.method == 'global_scaling': + results = self.global_scaling(results, self.params) + + elif self.method == 'random_translation_along_x': + results = self.random_translation_along_x(results, self.params) + elif self.method == 'random_translation_along_y': + results = self.random_translation_along_y(results, self.params) + elif self.method == 'random_translation_along_z': + results = self.random_translation_along_z(results, self.params) + + elif self.method == 'global_frustum_dropout_top': + results = self.global_frustum_dropout_top(results, self.params) + elif self.method == 'global_frustum_dropout_bottom': + results = self.global_frustum_dropout_bottom(results, self.params) + elif self.method == 'global_frustum_dropout_left': + results = self.global_frustum_dropout_left(results, self.params) + elif self.method == 'global_frustum_dropout_right': + results = self.global_frustum_dropout_right(results, self.params) + + elif self.method == 'local_scaling': + results = self.local_scaling(results, self.params) + # elif self.method == 'local_rotation': + # results = self.local_rotation(results, self.params) + + elif self.method == 'random_local_translation_along_x': + results = self.random_local_translation_along_x(results, self.params) + elif self.method == 'random_local_translation_along_y': + results = self.random_local_translation_along_y(results, self.params) + elif self.method == 'random_local_translation_along_z': + results = self.random_local_translation_along_z(results, self.params) + + elif self.method == 'local_frustum_dropout_top': + results = self.local_frustum_dropout_top(results, self.params) + elif self.method == 'local_frustum_dropout_bottom': + results = self.local_frustum_dropout_bottom(results, self.params) + elif self.method == 'local_frustum_dropout_left': + results = self.local_frustum_dropout_left(results, self.params) + elif self.method == 'local_frustum_dropout_right': + results = self.local_frustum_dropout_right(results, self.params) + + return results + +@PIPELINES.register_module() +class DataDenoisor(object): + """Point sample. + + Sampling data to a certain number. + + Args: + num_points (int): Number of points to be sampled. + sample_range (float, optional): The range where to sample points. + If not None, the points with depth larger than `sample_range` are + prior to be sampled. Defaults to None. + replace (bool, optional): Whether the sampling is with or without + replacement. Defaults to False. + """ + + def __init__(self, method): + self.method = method + + def __call__(self, results): + """Call function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' + and 'pts_semantic_mask' keys are updated in the result dict. 
+ """ + points = results['points'].tensor.numpy() + points = torch.tensor(points[:, :3]) + + if self.method == 'pcp': + param_filename = 'deephub/denoisy_model/pcp/pretrained/denoisingModel/PointCleanNet_params.pth' + model_filename = 'deephub/denoisy_model/pcp/pretrained/denoisingModel/PointCleanNet_model.pth' + trainopt = torch.load(param_filename) + pred_dim = 0 + output_pred_ind = [] + for o in trainopt.outputs: + if o in ['clean_points']: + output_pred_ind.append(pred_dim) + pred_dim += 3 + else: + raise ValueError('Unknown output: %s' % (o)) + + regressor = ResPCPNet( + num_points=trainopt.points_per_patch, + output_dim=pred_dim, + use_point_stn=trainopt.use_point_stn, + use_feat_stn=trainopt.use_feat_stn, + sym_op=trainopt.sym_op, + point_tuple=trainopt.point_tuple) + state_dict = torch.load(model_filename,map_location='cpu') + regressor.load_state_dict(state_dict) + + pred, trans, _, _ = regressor(points) + patch_radiuses = torch.FloatTensor([0.05]) + + denoised = pred + elif self.method == 'dmr': + num_points = points.shape[0] + if num_points >= 120000: + print('[INFO] Denoising large point cloud.') + denoised, downsampled = run_denoise_large_pointcloud( + pc=points, + cluster_size=30000, + patch_size=1000, + ckpt='deephub/denoisy_model/dmr/pretrained/supervised/epoch=153.ckpt', + device='cuda:0', + random_state=0, + expand_knn=16 + ) + elif num_points >= 60000: + print('[INFO] Denoising middle-sized point cloud.') + denoised, downsampled = run_denoise_middle_pointcloud( + pc=points, + num_splits=2, + patch_size=1000, + ckpt='deephub/denoisy_model/dmr/pretrained/supervised/epoch=153.ckpt', + device='cuda:0', + random_state=0, + expand_knn=16 + ) + elif num_points >= 10000: + print('[INFO] Denoising regular-sized point cloud.') + denoised, downsampled = run_denoise( + pc=points, + patch_size=1000, + ckpt='deephub/denoisy_model/dmr/pretrained/supervised/epoch=153.ckpt', + device='cuda:0', + random_state=0, + expand_knn=16 + ) + else: + assert False, "Our pretrained model does not support point clouds with less than 10K points." + results["points"] = LiDARPoints(denoised.astype(np.float32), + points_dim=denoised.shape[-1], attribute_dims=None) return results \ No newline at end of file diff --git a/mmdet3d/datasets/s3dis_dataset.py b/mmdet3d/datasets/s3dis_dataset.py index e38dc7a..070bc65 100644 --- a/mmdet3d/datasets/s3dis_dataset.py +++ b/mmdet3d/datasets/s3dis_dataset.py @@ -1,445 +1,445 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from os import path as osp - -import numpy as np - -from mmdet3d.core import show_seg_result -from mmdet3d.core.bbox import DepthInstance3DBoxes -from mmseg.datasets import DATASETS as SEG_DATASETS -from .builder import DATASETS -from .custom_3d import Custom3DDataset -from .custom_3d_seg import Custom3DSegDataset -from .pipelines import Compose - - -@DATASETS.register_module() -class S3DISDataset(Custom3DDataset): - r"""S3DIS Dataset for Detection Task. - - This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we - often train on 5 of them and test on the remaining one. The one for - test is Area_5 as suggested in `GSDN `_. - To concatenate 5 areas during training - `mmdet.datasets.dataset_wrappers.ConcatDataset` should be used. - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. 
- modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'Depth' in this dataset. Available options includes - - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - """ - CLASSES = ('table', 'chair', 'sofa', 'bookcase', 'board') - - def __init__(self, - data_root, - ann_file, - pipeline=None, - classes=None, - modality=None, - box_type_3d='Depth', - filter_empty_gt=True, - test_mode=False, - *kwargs): - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - modality=modality, - box_type_3d=box_type_3d, - filter_empty_gt=filter_empty_gt, - test_mode=test_mode, - *kwargs) - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: annotation information consists of the following keys: - - - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): - 3D ground truth bboxes - - gt_labels_3d (np.ndarray): Labels of ground truths. - - pts_instance_mask_path (str): Path of instance masks. - - pts_semantic_mask_path (str): Path of semantic masks. - """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - if info['annos']['gt_num'] != 0: - gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype( - np.float32) # k, 6 - gt_labels_3d = info['annos']['class'].astype(np.int64) - else: - gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32) - gt_labels_3d = np.zeros((0, ), dtype=np.int64) - - # to target box structure - gt_bboxes_3d = DepthInstance3DBoxes( - gt_bboxes_3d, - box_dim=gt_bboxes_3d.shape[-1], - with_yaw=False, - origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) - - pts_instance_mask_path = osp.join(self.data_root, - info['pts_instance_mask_path']) - pts_semantic_mask_path = osp.join(self.data_root, - info['pts_semantic_mask_path']) - - anns_results = dict( - gt_bboxes_3d=gt_bboxes_3d, - gt_labels_3d=gt_labels_3d, - pts_instance_mask_path=pts_instance_mask_path, - pts_semantic_mask_path=pts_semantic_mask_path) - return anns_results - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - - pts_filename (str): Filename of point clouds. - - file_name (str): Filename of point clouds. - - ann_info (dict): Annotation info. 
- """ - info = self.data_infos[index] - pts_filename = osp.join(self.data_root, info['pts_path']) - input_dict = dict(pts_filename=pts_filename) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): - return None - return input_dict - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='DEPTH', - shift_height=False, - load_dim=6, - use_dim=[0, 1, 2, 3, 4, 5]), - dict( - type='DefaultFormatBundle3D', - class_names=self.CLASSES, - with_label=False), - dict(type='Collect3D', keys=['points']) - ] - return Compose(pipeline) - - -class _S3DISSegDataset(Custom3DSegDataset): - r"""S3DIS Dataset for Semantic Segmentation Task. - - This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we - often train on 5 of them and test on the remaining one. - However, there is not a fixed train-test split of S3DIS. People often test - on Area_5 as suggested by `SEGCloud `_. - But many papers also report the average results of 6-fold cross validation - over the 6 areas (e.g. `DGCNN `_). - Therefore, we use an inner dataset for one area, and further use a dataset - wrapper to concat all the provided data in different areas. - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - palette (list[list[int]], optional): The palette of segmentation map. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - ignore_index (int, optional): The label index to be ignored, e.g. - unannotated points. If None is given, set to len(self.CLASSES). - Defaults to None. - scene_idxs (np.ndarray | str, optional): Precomputed index to load - data. For scenes with many points, we may sample it several times. - Defaults to None. - """ - CLASSES = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', - 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') - - VALID_CLASS_IDS = tuple(range(13)) - - ALL_CLASS_IDS = tuple(range(14)) # possibly with 'stair' class - - PALETTE = [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0], - [255, 0, 255], [100, 100, 255], [200, 200, 100], - [170, 120, 200], [255, 0, 0], [200, 100, 100], [10, 200, 100], - [200, 200, 200], [50, 50, 50]] - - def __init__(self, - data_root, - ann_file, - pipeline=None, - classes=None, - palette=None, - modality=None, - test_mode=False, - ignore_index=None, - scene_idxs=None, - **kwargs): - - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - palette=palette, - modality=modality, - test_mode=test_mode, - ignore_index=ignore_index, - scene_idxs=scene_idxs, - **kwargs) - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: annotation information consists of the following keys: - - - pts_semantic_mask_path (str): Path of semantic masks. 
- """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - - pts_semantic_mask_path = osp.join(self.data_root, - info['pts_semantic_mask_path']) - - anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) - return anns_results - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='DEPTH', - shift_height=False, - use_color=True, - load_dim=6, - use_dim=[0, 1, 2, 3, 4, 5]), - dict( - type='LoadAnnotations3D', - with_bbox_3d=False, - with_label_3d=False, - with_mask_3d=False, - with_seg_3d=True), - dict( - type='PointSegClassMapping', - valid_cat_ids=self.VALID_CLASS_IDS, - max_cat_id=np.max(self.ALL_CLASS_IDS)), - dict( - type='DefaultFormatBundle3D', - with_label=False, - class_names=self.CLASSES), - dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) - ] - return Compose(pipeline) - - def show(self, results, out_dir, show=True, pipeline=None): - """Results visualization. - - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - data_info = self.data_infos[i] - pts_path = data_info['pts_path'] - file_name = osp.split(pts_path)[-1].split('.')[0] - points, gt_sem_mask = self._extract_data( - i, pipeline, ['points', 'pts_semantic_mask'], load_annos=True) - points = points.numpy() - pred_sem_mask = result['semantic_mask'].numpy() - show_seg_result(points, gt_sem_mask, - pred_sem_mask, out_dir, file_name, - np.array(self.PALETTE), self.ignore_index, show) - - def get_scene_idxs(self, scene_idxs): - """Compute scene_idxs for data sampling. - - We sample more times for scenes with more points. - """ - # when testing, we load one whole scene every time - if not self.test_mode and scene_idxs is None: - raise NotImplementedError( - 'please provide re-sampled scene indexes for training') - - return super().get_scene_idxs(scene_idxs) - - -@DATASETS.register_module() -@SEG_DATASETS.register_module() -class S3DISSegDataset(_S3DISSegDataset): - r"""S3DIS Dataset for Semantic Segmentation Task. - - This class serves as the API for experiments on the S3DIS Dataset. - It wraps the provided datasets of different areas. - We don't use `mmdet.datasets.dataset_wrappers.ConcatDataset` because we - need to concat the `scene_idxs` of different areas. - - Please refer to the `google form `_ for - data downloading. - - Args: - data_root (str): Path of dataset root. - ann_files (list[str]): Path of several annotation files. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - palette (list[list[int]], optional): The palette of segmentation map. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - ignore_index (int, optional): The label index to be ignored, e.g. - unannotated points. If None is given, set to len(self.CLASSES). - Defaults to None. - scene_idxs (list[np.ndarray] | list[str], optional): Precomputed index - to load data. 
For scenes with many points, we may sample it several - times. Defaults to None. - """ - - def __init__(self, - data_root, - ann_files, - pipeline=None, - classes=None, - palette=None, - modality=None, - test_mode=False, - ignore_index=None, - scene_idxs=None, - **kwargs): - - # make sure that ann_files and scene_idxs have same length - ann_files = self._check_ann_files(ann_files) - scene_idxs = self._check_scene_idxs(scene_idxs, len(ann_files)) - - # initialize some attributes as datasets[0] - super().__init__( - data_root=data_root, - ann_file=ann_files[0], - pipeline=pipeline, - classes=classes, - palette=palette, - modality=modality, - test_mode=test_mode, - ignore_index=ignore_index, - scene_idxs=scene_idxs[0], - **kwargs) - - datasets = [ - _S3DISSegDataset( - data_root=data_root, - ann_file=ann_files[i], - pipeline=pipeline, - classes=classes, - palette=palette, - modality=modality, - test_mode=test_mode, - ignore_index=ignore_index, - scene_idxs=scene_idxs[i], - **kwargs) for i in range(len(ann_files)) - ] - - # data_infos and scene_idxs need to be concat - self.concat_data_infos([dst.data_infos for dst in datasets]) - self.concat_scene_idxs([dst.scene_idxs for dst in datasets]) - - # set group flag for the sampler - if not self.test_mode: - self._set_group_flag() - - def concat_data_infos(self, data_infos): - """Concat data_infos from several datasets to form self.data_infos. - - Args: - data_infos (list[list[dict]]) - """ - self.data_infos = [ - info for one_data_infos in data_infos for info in one_data_infos - ] - - def concat_scene_idxs(self, scene_idxs): - """Concat scene_idxs from several datasets to form self.scene_idxs. - - Needs to manually add offset to scene_idxs[1, 2, ...]. - - Args: - scene_idxs (list[np.ndarray]) - """ - self.scene_idxs = np.array([], dtype=np.int32) - offset = 0 - for one_scene_idxs in scene_idxs: - self.scene_idxs = np.concatenate( - [self.scene_idxs, one_scene_idxs + offset]).astype(np.int32) - offset = np.unique(self.scene_idxs).max() + 1 - - @staticmethod - def _duplicate_to_list(x, num): - """Repeat x `num` times to form a list.""" - return [x for _ in range(num)] - - def _check_ann_files(self, ann_file): - """Make ann_files as list/tuple.""" - # ann_file could be str - if not isinstance(ann_file, (list, tuple)): - ann_file = self._duplicate_to_list(ann_file, 1) - return ann_file - - def _check_scene_idxs(self, scene_idx, num): - """Make scene_idxs as list/tuple.""" - if scene_idx is None: - return self._duplicate_to_list(scene_idx, num) - # scene_idx could be str, np.ndarray, list or tuple - if isinstance(scene_idx, str): # str - return self._duplicate_to_list(scene_idx, num) - if isinstance(scene_idx[0], str): # list of str - return scene_idx - if isinstance(scene_idx[0], (list, tuple, np.ndarray)): # list of idx - return scene_idx - # single idx - return self._duplicate_to_list(scene_idx, num) +# Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + +import numpy as np + +from mmdet3d.core import show_seg_result +from mmdet3d.core.bbox import DepthInstance3DBoxes +from mmseg.datasets import DATASETS as SEG_DATASETS +from .builder import DATASETS +from .custom_3d import Custom3DDataset +from .custom_3d_seg import Custom3DSegDataset +from .pipelines import Compose + + +@DATASETS.register_module() +class S3DISDataset(Custom3DDataset): + r"""S3DIS Dataset for Detection Task. + + This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we + often train on 5 of them and test on the remaining one. 
The one for + test is Area_5 as suggested in `GSDN `_. + To concatenate 5 areas during training + `mmdet.datasets.dataset_wrappers.ConcatDataset` should be used. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'Depth' in this dataset. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ + CLASSES = ('table', 'chair', 'sofa', 'bookcase', 'board') + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + modality=None, + box_type_3d='Depth', + filter_empty_gt=True, + test_mode=False, + *kwargs): + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + *kwargs) + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - pts_instance_mask_path (str): Path of instance masks. + - pts_semantic_mask_path (str): Path of semantic masks. + """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + if info['annos']['gt_num'] != 0: + gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype( + np.float32) # k, 6 + gt_labels_3d = info['annos']['class'].astype(np.int64) + else: + gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32) + gt_labels_3d = np.zeros((0, ), dtype=np.int64) + + # to target box structure + gt_bboxes_3d = DepthInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + with_yaw=False, + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + pts_instance_mask_path = osp.join(self.data_root, + info['pts_instance_mask_path']) + pts_semantic_mask_path = osp.join(self.data_root, + info['pts_semantic_mask_path']) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + pts_instance_mask_path=pts_instance_mask_path, + pts_semantic_mask_path=pts_semantic_mask_path) + return anns_results + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. It includes the following keys: + + - pts_filename (str): Filename of point clouds. + - file_name (str): Filename of point clouds. + - ann_info (dict): Annotation info. 
+ """ + info = self.data_infos[index] + pts_filename = osp.join(self.data_root, info['pts_path']) + input_dict = dict(pts_filename=pts_filename) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): + return None + return input_dict + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['points']) + ] + return Compose(pipeline) + + +class _S3DISSegDataset(Custom3DSegDataset): + r"""S3DIS Dataset for Semantic Segmentation Task. + + This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we + often train on 5 of them and test on the remaining one. + However, there is not a fixed train-test split of S3DIS. People often test + on Area_5 as suggested by `SEGCloud `_. + But many papers also report the average results of 6-fold cross validation + over the 6 areas (e.g. `DGCNN `_). + Therefore, we use an inner dataset for one area, and further use a dataset + wrapper to concat all the provided data in different areas. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + palette (list[list[int]], optional): The palette of segmentation map. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + ignore_index (int, optional): The label index to be ignored, e.g. + unannotated points. If None is given, set to len(self.CLASSES). + Defaults to None. + scene_idxs (np.ndarray | str, optional): Precomputed index to load + data. For scenes with many points, we may sample it several times. + Defaults to None. + """ + CLASSES = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', + 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') + + VALID_CLASS_IDS = tuple(range(13)) + + ALL_CLASS_IDS = tuple(range(14)) # possibly with 'stair' class + + PALETTE = [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0], + [255, 0, 255], [100, 100, 255], [200, 200, 100], + [170, 120, 200], [255, 0, 0], [200, 100, 100], [10, 200, 100], + [200, 200, 200], [50, 50, 50]] + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + palette=None, + modality=None, + test_mode=False, + ignore_index=None, + scene_idxs=None, + **kwargs): + + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + palette=palette, + modality=modality, + test_mode=test_mode, + ignore_index=ignore_index, + scene_idxs=scene_idxs, + **kwargs) + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: annotation information consists of the following keys: + + - pts_semantic_mask_path (str): Path of semantic masks. 
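A numpy-only illustration (not from the patch itself) of the `filter_empty_gt` test used in `get_data_info` above: `~(gt_labels_3d != -1).any()` evaluates to True exactly when a sample carries no valid label, so such samples are skipped during training. The label arrays below are made up.

import numpy as np

def has_no_valid_gt(gt_labels_3d):
    # np.ndarray.any() returns np.bool_, on which `~` acts as logical NOT.
    return bool(~(gt_labels_3d != -1).any())

print(has_no_valid_gt(np.array([0, 3, 1])))            # False -> sample kept
print(has_no_valid_gt(np.array([], dtype=np.int64)))   # True  -> sample skipped
print(has_no_valid_gt(np.array([-1, -1])))             # True  -> sample skipped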
+ """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + + pts_semantic_mask_path = osp.join(self.data_root, + info['pts_semantic_mask_path']) + + anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) + return anns_results + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=self.VALID_CLASS_IDS, + max_cat_id=np.max(self.ALL_CLASS_IDS)), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=self.CLASSES), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + data_info = self.data_infos[i] + pts_path = data_info['pts_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points, gt_sem_mask = self._extract_data( + i, pipeline, ['points', 'pts_semantic_mask'], load_annos=True) + points = points.numpy() + pred_sem_mask = result['semantic_mask'].numpy() + show_seg_result(points, gt_sem_mask, + pred_sem_mask, out_dir, file_name, + np.array(self.PALETTE), self.ignore_index, show) + + def get_scene_idxs(self, scene_idxs): + """Compute scene_idxs for data sampling. + + We sample more times for scenes with more points. + """ + # when testing, we load one whole scene every time + if not self.test_mode and scene_idxs is None: + raise NotImplementedError( + 'please provide re-sampled scene indexes for training') + + return super().get_scene_idxs(scene_idxs) + + +@DATASETS.register_module() +@SEG_DATASETS.register_module() +class S3DISSegDataset(_S3DISSegDataset): + r"""S3DIS Dataset for Semantic Segmentation Task. + + This class serves as the API for experiments on the S3DIS Dataset. + It wraps the provided datasets of different areas. + We don't use `mmdet.datasets.dataset_wrappers.ConcatDataset` because we + need to concat the `scene_idxs` of different areas. + + Please refer to the `google form `_ for + data downloading. + + Args: + data_root (str): Path of dataset root. + ann_files (list[str]): Path of several annotation files. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + palette (list[list[int]], optional): The palette of segmentation map. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + ignore_index (int, optional): The label index to be ignored, e.g. + unannotated points. If None is given, set to len(self.CLASSES). + Defaults to None. + scene_idxs (list[np.ndarray] | list[str], optional): Precomputed index + to load data. 
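A numpy-only sketch (not from the patch itself) of the scene-index merging described in the docstring above and implemented by `concat_scene_idxs` a bit further down: indices of later areas are shifted by an offset so they still point at their own scenes once the `data_infos` of all areas are concatenated. The per-area index arrays below are invented.

import numpy as np

# e.g. Area_1 has 3 scenes (scene 0 re-sampled twice), Area_2 has 2 scenes
per_area_scene_idxs = [np.array([0, 0, 1, 2]), np.array([0, 1, 1])]

merged = np.array([], dtype=np.int32)
offset = 0
for one_scene_idxs in per_area_scene_idxs:
    merged = np.concatenate([merged, one_scene_idxs + offset]).astype(np.int32)
    offset = np.unique(merged).max() + 1

print(merged)  # [0 0 1 2 3 4 4] -> Area_2 scenes become global ids 3 and 4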
For scenes with many points, we may sample it several + times. Defaults to None. + """ + + def __init__(self, + data_root, + ann_files, + pipeline=None, + classes=None, + palette=None, + modality=None, + test_mode=False, + ignore_index=None, + scene_idxs=None, + **kwargs): + + # make sure that ann_files and scene_idxs have same length + ann_files = self._check_ann_files(ann_files) + scene_idxs = self._check_scene_idxs(scene_idxs, len(ann_files)) + + # initialize some attributes as datasets[0] + super().__init__( + data_root=data_root, + ann_file=ann_files[0], + pipeline=pipeline, + classes=classes, + palette=palette, + modality=modality, + test_mode=test_mode, + ignore_index=ignore_index, + scene_idxs=scene_idxs[0], + **kwargs) + + datasets = [ + _S3DISSegDataset( + data_root=data_root, + ann_file=ann_files[i], + pipeline=pipeline, + classes=classes, + palette=palette, + modality=modality, + test_mode=test_mode, + ignore_index=ignore_index, + scene_idxs=scene_idxs[i], + **kwargs) for i in range(len(ann_files)) + ] + + # data_infos and scene_idxs need to be concat + self.concat_data_infos([dst.data_infos for dst in datasets]) + self.concat_scene_idxs([dst.scene_idxs for dst in datasets]) + + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + def concat_data_infos(self, data_infos): + """Concat data_infos from several datasets to form self.data_infos. + + Args: + data_infos (list[list[dict]]) + """ + self.data_infos = [ + info for one_data_infos in data_infos for info in one_data_infos + ] + + def concat_scene_idxs(self, scene_idxs): + """Concat scene_idxs from several datasets to form self.scene_idxs. + + Needs to manually add offset to scene_idxs[1, 2, ...]. + + Args: + scene_idxs (list[np.ndarray]) + """ + self.scene_idxs = np.array([], dtype=np.int32) + offset = 0 + for one_scene_idxs in scene_idxs: + self.scene_idxs = np.concatenate( + [self.scene_idxs, one_scene_idxs + offset]).astype(np.int32) + offset = np.unique(self.scene_idxs).max() + 1 + + @staticmethod + def _duplicate_to_list(x, num): + """Repeat x `num` times to form a list.""" + return [x for _ in range(num)] + + def _check_ann_files(self, ann_file): + """Make ann_files as list/tuple.""" + # ann_file could be str + if not isinstance(ann_file, (list, tuple)): + ann_file = self._duplicate_to_list(ann_file, 1) + return ann_file + + def _check_scene_idxs(self, scene_idx, num): + """Make scene_idxs as list/tuple.""" + if scene_idx is None: + return self._duplicate_to_list(scene_idx, num) + # scene_idx could be str, np.ndarray, list or tuple + if isinstance(scene_idx, str): # str + return self._duplicate_to_list(scene_idx, num) + if isinstance(scene_idx[0], str): # list of str + return scene_idx + if isinstance(scene_idx[0], (list, tuple, np.ndarray)): # list of idx + return scene_idx + # single idx + return self._duplicate_to_list(scene_idx, num) diff --git a/mmdet3d/datasets/scannet_dataset.py b/mmdet3d/datasets/scannet_dataset.py index 3e69126..2141876 100644 --- a/mmdet3d/datasets/scannet_dataset.py +++ b/mmdet3d/datasets/scannet_dataset.py @@ -1,614 +1,614 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import tempfile -import warnings -from os import path as osp - -import numpy as np - -from mmdet3d.core import instance_seg_eval, show_result, show_seg_result -from mmdet3d.core.bbox import DepthInstance3DBoxes -from mmseg.datasets import DATASETS as SEG_DATASETS -from .builder import DATASETS -from .custom_3d import Custom3DDataset -from .custom_3d_seg import Custom3DSegDataset -from .pipelines import Compose - - -@DATASETS.register_module() -class ScanNetDataset(Custom3DDataset): - r"""ScanNet Dataset for Detection Task. - - This class serves as the API for experiments on the ScanNet Dataset. - - Please refer to the `github repo `_ - for data downloading. - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'Depth' in this dataset. Available options includes - - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - """ - CLASSES = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', - 'bookshelf', 'picture', 'counter', 'desk', 'curtain', - 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', - 'garbagebin') - - def __init__(self, - data_root, - ann_file, - pipeline=None, - classes=None, - modality=dict(use_camera=False, use_depth=True), - box_type_3d='Depth', - filter_empty_gt=True, - test_mode=False, - **kwargs): - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - modality=modality, - box_type_3d=box_type_3d, - filter_empty_gt=filter_empty_gt, - test_mode=test_mode, - **kwargs) - assert 'use_camera' in self.modality and \ - 'use_depth' in self.modality - assert self.modality['use_camera'] or self.modality['use_depth'] - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): Sample index. - - pts_filename (str): Filename of point clouds. - - file_name (str): Filename of point clouds. - - img_prefix (str, optional): Prefix of image files. - - img_info (dict, optional): Image info. - - ann_info (dict): Annotation info. 
- """ - info = self.data_infos[index] - sample_idx = info['point_cloud']['lidar_idx'] - pts_filename = osp.join(self.data_root, info['pts_path']) - input_dict = dict(sample_idx=sample_idx) - - if self.modality['use_depth']: - input_dict['pts_filename'] = pts_filename - input_dict['file_name'] = pts_filename - - if self.modality['use_camera']: - img_info = [] - for img_path in info['img_paths']: - img_info.append( - dict(filename=osp.join(self.data_root, img_path))) - intrinsic = info['intrinsics'] - axis_align_matrix = self._get_axis_align_matrix(info) - depth2img = [] - for extrinsic in info['extrinsics']: - depth2img.append( - intrinsic @ np.linalg.inv(axis_align_matrix @ extrinsic)) - - input_dict['img_prefix'] = None - input_dict['img_info'] = img_info - input_dict['depth2img'] = depth2img - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): - return None - return input_dict - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: annotation information consists of the following keys: - - - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): - 3D ground truth bboxes - - gt_labels_3d (np.ndarray): Labels of ground truths. - - pts_instance_mask_path (str): Path of instance masks. - - pts_semantic_mask_path (str): Path of semantic masks. - - axis_align_matrix (np.ndarray): Transformation matrix for - global scene alignment. - """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - if info['annos']['gt_num'] != 0: - gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype( - np.float32) # k, 6 - gt_labels_3d = info['annos']['class'].astype(np.int64) - else: - gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32) - gt_labels_3d = np.zeros((0, ), dtype=np.int64) - - # to target box structure - gt_bboxes_3d = DepthInstance3DBoxes( - gt_bboxes_3d, - box_dim=gt_bboxes_3d.shape[-1], - with_yaw=False, - origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) - - pts_instance_mask_path = osp.join(self.data_root, - info['pts_instance_mask_path']) - pts_semantic_mask_path = osp.join(self.data_root, - info['pts_semantic_mask_path']) - - axis_align_matrix = self._get_axis_align_matrix(info) - - anns_results = dict( - gt_bboxes_3d=gt_bboxes_3d, - gt_labels_3d=gt_labels_3d, - pts_instance_mask_path=pts_instance_mask_path, - pts_semantic_mask_path=pts_semantic_mask_path, - axis_align_matrix=axis_align_matrix) - return anns_results - - def prepare_test_data(self, index): - """Prepare data for testing. - - We should take axis_align_matrix from self.data_infos since we need - to align point clouds. - - Args: - index (int): Index for accessing the target data. - - Returns: - dict: Testing data dict of the corresponding index. - """ - input_dict = self.get_data_info(index) - # take the axis_align_matrix from data_infos - input_dict['ann_info'] = dict( - axis_align_matrix=self._get_axis_align_matrix( - self.data_infos[index])) - self.pre_pipeline(input_dict) - example = self.pipeline(input_dict) - return example - - @staticmethod - def _get_axis_align_matrix(info): - """Get axis_align_matrix from info. If not exist, return identity mat. - - Args: - info (dict): one data info term. - - Returns: - np.ndarray: 4x4 transformation matrix. 
- """ - if 'axis_align_matrix' in info['annos'].keys(): - return info['annos']['axis_align_matrix'].astype(np.float32) - else: - warnings.warn( - 'axis_align_matrix is not found in ScanNet data info, please ' - 'use new pre-process scripts to re-generate ScanNet data') - return np.eye(4).astype(np.float32) - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='DEPTH', - shift_height=False, - load_dim=6, - use_dim=[0, 1, 2]), - dict(type='GlobalAlignment', rotation_axis=2), - dict( - type='DefaultFormatBundle3D', - class_names=self.CLASSES, - with_label=False), - dict(type='Collect3D', keys=['points']) - ] - return Compose(pipeline) - - def show(self, results, out_dir, show=True, pipeline=None): - """Results visualization. - - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - data_info = self.data_infos[i] - pts_path = data_info['pts_path'] - file_name = osp.split(pts_path)[-1].split('.')[0] - points = self._extract_data(i, pipeline, 'points').numpy() - gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() - pred_bboxes = result['boxes_3d'].tensor.numpy() - show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name, - show) - - -@DATASETS.register_module() -@SEG_DATASETS.register_module() -class ScanNetSegDataset(Custom3DSegDataset): - r"""ScanNet Dataset for Semantic Segmentation Task. - - This class serves as the API for experiments on the ScanNet Dataset. - - Please refer to the `github repo `_ - for data downloading. - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - palette (list[list[int]], optional): The palette of segmentation map. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - ignore_index (int, optional): The label index to be ignored, e.g. - unannotated points. If None is given, set to len(self.CLASSES). - Defaults to None. - scene_idxs (np.ndarray | str, optional): Precomputed index to load - data. For scenes with many points, we may sample it several times. - Defaults to None. 
- """ - CLASSES = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', - 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', - 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', - 'bathtub', 'otherfurniture') - - VALID_CLASS_IDS = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, - 33, 34, 36, 39) - - ALL_CLASS_IDS = tuple(range(41)) - - PALETTE = [ - [174, 199, 232], - [152, 223, 138], - [31, 119, 180], - [255, 187, 120], - [188, 189, 34], - [140, 86, 75], - [255, 152, 150], - [214, 39, 40], - [197, 176, 213], - [148, 103, 189], - [196, 156, 148], - [23, 190, 207], - [247, 182, 210], - [219, 219, 141], - [255, 127, 14], - [158, 218, 229], - [44, 160, 44], - [112, 128, 144], - [227, 119, 194], - [82, 84, 163], - ] - - def __init__(self, - data_root, - ann_file, - pipeline=None, - classes=None, - palette=None, - modality=None, - test_mode=False, - ignore_index=None, - scene_idxs=None, - **kwargs): - - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - palette=palette, - modality=modality, - test_mode=test_mode, - ignore_index=ignore_index, - scene_idxs=scene_idxs, - **kwargs) - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: annotation information consists of the following keys: - - - pts_semantic_mask_path (str): Path of semantic masks. - """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - - pts_semantic_mask_path = osp.join(self.data_root, - info['pts_semantic_mask_path']) - - anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) - return anns_results - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='DEPTH', - shift_height=False, - use_color=True, - load_dim=6, - use_dim=[0, 1, 2, 3, 4, 5]), - dict( - type='LoadAnnotations3D', - with_bbox_3d=False, - with_label_3d=False, - with_mask_3d=False, - with_seg_3d=True), - dict( - type='PointSegClassMapping', - valid_cat_ids=self.VALID_CLASS_IDS, - max_cat_id=np.max(self.ALL_CLASS_IDS)), - dict( - type='DefaultFormatBundle3D', - with_label=False, - class_names=self.CLASSES), - dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) - ] - return Compose(pipeline) - - def show(self, results, out_dir, show=True, pipeline=None): - """Results visualization. - - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - data_info = self.data_infos[i] - pts_path = data_info['pts_path'] - file_name = osp.split(pts_path)[-1].split('.')[0] - points, gt_sem_mask = self._extract_data( - i, pipeline, ['points', 'pts_semantic_mask'], load_annos=True) - points = points.numpy() - pred_sem_mask = result['semantic_mask'].numpy() - show_seg_result(points, gt_sem_mask, - pred_sem_mask, out_dir, file_name, - np.array(self.PALETTE), self.ignore_index, show) - - def get_scene_idxs(self, scene_idxs): - """Compute scene_idxs for data sampling. - - We sample more times for scenes with more points. 
- """ - # when testing, we load one whole scene every time - if not self.test_mode and scene_idxs is None: - raise NotImplementedError( - 'please provide re-sampled scene indexes for training') - - return super().get_scene_idxs(scene_idxs) - - def format_results(self, results, txtfile_prefix=None): - r"""Format the results to txt file. Refer to `ScanNet documentation - `_. - - Args: - outputs (list[dict]): Testing results of the dataset. - txtfile_prefix (str): The prefix of saved files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - - Returns: - tuple: (outputs, tmp_dir), outputs is the detection results, - tmp_dir is the temporal directory created for saving submission - files when ``submission_prefix`` is not specified. - """ - import mmcv - - if txtfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - txtfile_prefix = osp.join(tmp_dir.name, 'results') - else: - tmp_dir = None - mmcv.mkdir_or_exist(txtfile_prefix) - - # need to map network output to original label idx - pred2label = np.zeros(len(self.VALID_CLASS_IDS)).astype(np.int) - for original_label, output_idx in self.label_map.items(): - if output_idx != self.ignore_index: - pred2label[output_idx] = original_label - - outputs = [] - for i, result in enumerate(results): - info = self.data_infos[i] - sample_idx = info['point_cloud']['lidar_idx'] - pred_sem_mask = result['semantic_mask'].numpy().astype(np.int) - pred_label = pred2label[pred_sem_mask] - curr_file = f'{txtfile_prefix}/{sample_idx}.txt' - np.savetxt(curr_file, pred_label, fmt='%d') - outputs.append(dict(seg_mask=pred_label)) - - return outputs, tmp_dir - - -@DATASETS.register_module() -@SEG_DATASETS.register_module() -class ScanNetInstanceSegDataset(Custom3DSegDataset): - CLASSES = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', - 'bookshelf', 'picture', 'counter', 'desk', 'curtain', - 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', - 'garbagebin') - - VALID_CLASS_IDS = (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, - 36, 39) - - ALL_CLASS_IDS = tuple(range(41)) - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: annotation information consists of the following keys: - - pts_semantic_mask_path (str): Path of semantic masks. - - pts_instance_mask_path (str): Path of instance masks. - """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - - pts_instance_mask_path = osp.join(self.data_root, - info['pts_instance_mask_path']) - pts_semantic_mask_path = osp.join(self.data_root, - info['pts_semantic_mask_path']) - - anns_results = dict( - pts_instance_mask_path=pts_instance_mask_path, - pts_semantic_mask_path=pts_semantic_mask_path) - return anns_results - - def get_classes_and_palette(self, classes=None, palette=None): - """Get class names of current dataset. Palette is simply ignored for - instance segmentation. - - Args: - classes (Sequence[str] | str | None): If classes is None, use - default CLASSES defined by builtin dataset. If classes is a - string, take it as a file name. The file contains the name of - classes where each line contains one class name. If classes is - a tuple or list, override the CLASSES defined by the dataset. - Defaults to None. - palette (Sequence[Sequence[int]]] | np.ndarray | None): - The palette of segmentation map. 
If None is given, random - palette will be generated. Defaults to None. - """ - if classes is not None: - return classes, None - return self.CLASSES, None - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='DEPTH', - shift_height=False, - use_color=True, - load_dim=6, - use_dim=[0, 1, 2, 3, 4, 5]), - dict( - type='LoadAnnotations3D', - with_bbox_3d=False, - with_label_3d=False, - with_mask_3d=True, - with_seg_3d=True), - dict( - type='PointSegClassMapping', - valid_cat_ids=self.VALID_CLASS_IDS, - max_cat_id=40), - dict( - type='DefaultFormatBundle3D', - with_label=False, - class_names=self.CLASSES), - dict( - type='Collect3D', - keys=['points', 'pts_semantic_mask', 'pts_instance_mask']) - ] - return Compose(pipeline) - - def evaluate(self, - results, - metric=None, - options=None, - logger=None, - show=False, - out_dir=None, - pipeline=None): - """Evaluation in instance segmentation protocol. - - Args: - results (list[dict]): List of results. - metric (str | list[str]): Metrics to be evaluated. - options (dict, optional): options for instance_seg_eval. - logger (logging.Logger | None | str): Logger used for printing - related information during evaluation. Defaults to None. - show (bool, optional): Whether to visualize. - Defaults to False. - out_dir (str, optional): Path to save the visualization results. - Defaults to None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict: Evaluation results. - """ - assert isinstance( - results, list), f'Expect results to be list, got {type(results)}.' - assert len(results) > 0, 'Expect length of results > 0.' - assert len(results) == len(self.data_infos) - assert isinstance( - results[0], dict - ), f'Expect elements in results to be dict, got {type(results[0])}.' - - load_pipeline = self._get_pipeline(pipeline) - pred_instance_masks = [result['instance_mask'] for result in results] - pred_instance_labels = [result['instance_label'] for result in results] - pred_instance_scores = [result['instance_score'] for result in results] - gt_semantic_masks, gt_instance_masks = zip(*[ - self._extract_data( - index=i, - pipeline=load_pipeline, - key=['pts_semantic_mask', 'pts_instance_mask'], - load_annos=True) for i in range(len(self.data_infos)) - ]) - ret_dict = instance_seg_eval( - gt_semantic_masks, - gt_instance_masks, - pred_instance_masks, - pred_instance_labels, - pred_instance_scores, - valid_class_ids=self.VALID_CLASS_IDS, - class_labels=self.CLASSES, - options=options, - logger=logger) - - if show: - raise NotImplementedError('show is not implemented for now') - - return ret_dict +# Copyright (c) OpenMMLab. All rights reserved. +import tempfile +import warnings +from os import path as osp + +import numpy as np + +from mmdet3d.core import instance_seg_eval, show_result, show_seg_result +from mmdet3d.core.bbox import DepthInstance3DBoxes +from mmseg.datasets import DATASETS as SEG_DATASETS +from .builder import DATASETS +from .custom_3d import Custom3DDataset +from .custom_3d_seg import Custom3DSegDataset +from .pipelines import Compose + + +@DATASETS.register_module() +class ScanNetDataset(Custom3DDataset): + r"""ScanNet Dataset for Detection Task. + + This class serves as the API for experiments on the ScanNet Dataset. + + Please refer to the `github repo `_ + for data downloading. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. 
+ pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'Depth' in this dataset. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ + CLASSES = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin') + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + modality=dict(use_camera=False, use_depth=True), + box_type_3d='Depth', + filter_empty_gt=True, + test_mode=False, + **kwargs): + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + **kwargs) + assert 'use_camera' in self.modality and \ + 'use_depth' in self.modality + assert self.modality['use_camera'] or self.modality['use_depth'] + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - file_name (str): Filename of point clouds. + - img_prefix (str, optional): Prefix of image files. + - img_info (dict, optional): Image info. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info['point_cloud']['lidar_idx'] + pts_filename = osp.join(self.data_root, info['pts_path']) + input_dict = dict(sample_idx=sample_idx) + + if self.modality['use_depth']: + input_dict['pts_filename'] = pts_filename + input_dict['file_name'] = pts_filename + + if self.modality['use_camera']: + img_info = [] + for img_path in info['img_paths']: + img_info.append( + dict(filename=osp.join(self.data_root, img_path))) + intrinsic = info['intrinsics'] + axis_align_matrix = self._get_axis_align_matrix(info) + depth2img = [] + for extrinsic in info['extrinsics']: + depth2img.append( + intrinsic @ np.linalg.inv(axis_align_matrix @ extrinsic)) + + input_dict['img_prefix'] = None + input_dict['img_info'] = img_info + input_dict['depth2img'] = depth2img + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): + return None + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. 
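A numpy-only sketch (not from the patch itself) of the `depth2img` composition in `get_data_info` above. The intrinsic and extrinsic values are invented, and the reading that `extrinsic` is a camera-to-depth pose is an assumption; under that reading the composed matrix takes an axis-aligned depth point to homogeneous pixel coordinates.

import numpy as np

intrinsic = np.array([[577.87, 0., 319.5, 0.],
                      [0., 577.87, 239.5, 0.],
                      [0., 0., 1., 0.],
                      [0., 0., 0., 1.]])
extrinsic = np.eye(4)            # camera pose in the un-aligned depth frame (toy value)
axis_align_matrix = np.eye(4)    # scene alignment; identity for this toy case

depth2img = intrinsic @ np.linalg.inv(axis_align_matrix @ extrinsic)

point = np.array([1.0, 0.5, 2.0, 1.0])     # homogeneous 3D point, 2 m in front
u, v, z = (depth2img @ point)[:3]
print(u / z, v / z)                        # pixel coordinates, roughly (608.4, 384.0)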
+ - pts_instance_mask_path (str): Path of instance masks. + - pts_semantic_mask_path (str): Path of semantic masks. + - axis_align_matrix (np.ndarray): Transformation matrix for + global scene alignment. + """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + if info['annos']['gt_num'] != 0: + gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype( + np.float32) # k, 6 + gt_labels_3d = info['annos']['class'].astype(np.int64) + else: + gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32) + gt_labels_3d = np.zeros((0, ), dtype=np.int64) + + # to target box structure + gt_bboxes_3d = DepthInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + with_yaw=False, + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + pts_instance_mask_path = osp.join(self.data_root, + info['pts_instance_mask_path']) + pts_semantic_mask_path = osp.join(self.data_root, + info['pts_semantic_mask_path']) + + axis_align_matrix = self._get_axis_align_matrix(info) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + pts_instance_mask_path=pts_instance_mask_path, + pts_semantic_mask_path=pts_semantic_mask_path, + axis_align_matrix=axis_align_matrix) + return anns_results + + def prepare_test_data(self, index): + """Prepare data for testing. + + We should take axis_align_matrix from self.data_infos since we need + to align point clouds. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + # take the axis_align_matrix from data_infos + input_dict['ann_info'] = dict( + axis_align_matrix=self._get_axis_align_matrix( + self.data_infos[index])) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + @staticmethod + def _get_axis_align_matrix(info): + """Get axis_align_matrix from info. If not exist, return identity mat. + + Args: + info (dict): one data info term. + + Returns: + np.ndarray: 4x4 transformation matrix. + """ + if 'axis_align_matrix' in info['annos'].keys(): + return info['annos']['axis_align_matrix'].astype(np.float32) + else: + warnings.warn( + 'axis_align_matrix is not found in ScanNet data info, please ' + 'use new pre-process scripts to re-generate ScanNet data') + return np.eye(4).astype(np.float32) + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['points']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' 
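A numpy-only sketch (not from the patch itself) of what the 4x4 matrix returned by `_get_axis_align_matrix` above represents: a homogeneous rotation plus translation that moves the raw point cloud into an axis-aligned scene frame, which is effectively what the `GlobalAlignment` step in the default pipeline applies. The matrix and points are toy values.

import numpy as np

theta = np.pi / 2                                # 90 degree yaw correction
axis_align_matrix = np.array([
    [np.cos(theta), -np.sin(theta), 0., 1.0],    # last column: translation
    [np.sin(theta),  np.cos(theta), 0., 0.0],
    [0.,             0.,            1., 0.0],
    [0.,             0.,            0., 1.0],
], dtype=np.float32)

points = np.array([[1., 0., 0.],
                   [0., 2., 0.]], dtype=np.float32)              # (N, 3) xyz
homo = np.hstack([points, np.ones((points.shape[0], 1), np.float32)])
aligned = (axis_align_matrix @ homo.T).T[:, :3]
print(aligned)   # ~[[1. 1. 0.], [-1. 0. 0.]] up to float rounding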
+ pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + data_info = self.data_infos[i] + pts_path = data_info['pts_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points = self._extract_data(i, pipeline, 'points').numpy() + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() + pred_bboxes = result['boxes_3d'].tensor.numpy() + show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name, + show) + + +@DATASETS.register_module() +@SEG_DATASETS.register_module() +class ScanNetSegDataset(Custom3DSegDataset): + r"""ScanNet Dataset for Semantic Segmentation Task. + + This class serves as the API for experiments on the ScanNet Dataset. + + Please refer to the `github repo `_ + for data downloading. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + palette (list[list[int]], optional): The palette of segmentation map. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + ignore_index (int, optional): The label index to be ignored, e.g. + unannotated points. If None is given, set to len(self.CLASSES). + Defaults to None. + scene_idxs (np.ndarray | str, optional): Precomputed index to load + data. For scenes with many points, we may sample it several times. + Defaults to None. + """ + CLASSES = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', + 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', + 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', + 'bathtub', 'otherfurniture') + + VALID_CLASS_IDS = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39) + + ALL_CLASS_IDS = tuple(range(41)) + + PALETTE = [ + [174, 199, 232], + [152, 223, 138], + [31, 119, 180], + [255, 187, 120], + [188, 189, 34], + [140, 86, 75], + [255, 152, 150], + [214, 39, 40], + [197, 176, 213], + [148, 103, 189], + [196, 156, 148], + [23, 190, 207], + [247, 182, 210], + [219, 219, 141], + [255, 127, 14], + [158, 218, 229], + [44, 160, 44], + [112, 128, 144], + [227, 119, 194], + [82, 84, 163], + ] + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + palette=None, + modality=None, + test_mode=False, + ignore_index=None, + scene_idxs=None, + **kwargs): + + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + palette=palette, + modality=modality, + test_mode=test_mode, + ignore_index=ignore_index, + scene_idxs=scene_idxs, + **kwargs) + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: annotation information consists of the following keys: + + - pts_semantic_mask_path (str): Path of semantic masks. 
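A numpy-only sketch (not from the patch itself) of the mapping performed by the `PointSegClassMapping` step in the default pipeline below: raw ScanNet category ids from `VALID_CLASS_IDS` above become contiguous train ids 0-19, and anything else becomes the ignore value `len(CLASSES)`. This mirrors my reading of that pipeline step; the raw mask is invented.

import numpy as np

VALID_CLASS_IDS = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
                   33, 34, 36, 39)
max_cat_id = 40
ignore = len(VALID_CLASS_IDS)                        # 20

cat_id2class = np.full(max_cat_id + 1, ignore, dtype=np.int64)
for train_id, cat_id in enumerate(VALID_CLASS_IDS):
    cat_id2class[cat_id] = train_id

raw_sem_mask = np.array([1, 39, 0, 13, 24])          # per-point raw labels
print(cat_id2class[raw_sem_mask])                    # [ 0 19 20 20 14]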
+ """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + + pts_semantic_mask_path = osp.join(self.data_root, + info['pts_semantic_mask_path']) + + anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) + return anns_results + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=self.VALID_CLASS_IDS, + max_cat_id=np.max(self.ALL_CLASS_IDS)), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=self.CLASSES), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + data_info = self.data_infos[i] + pts_path = data_info['pts_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points, gt_sem_mask = self._extract_data( + i, pipeline, ['points', 'pts_semantic_mask'], load_annos=True) + points = points.numpy() + pred_sem_mask = result['semantic_mask'].numpy() + show_seg_result(points, gt_sem_mask, + pred_sem_mask, out_dir, file_name, + np.array(self.PALETTE), self.ignore_index, show) + + def get_scene_idxs(self, scene_idxs): + """Compute scene_idxs for data sampling. + + We sample more times for scenes with more points. + """ + # when testing, we load one whole scene every time + if not self.test_mode and scene_idxs is None: + raise NotImplementedError( + 'please provide re-sampled scene indexes for training') + + return super().get_scene_idxs(scene_idxs) + + def format_results(self, results, txtfile_prefix=None): + r"""Format the results to txt file. Refer to `ScanNet documentation + `_. + + Args: + outputs (list[dict]): Testing results of the dataset. + txtfile_prefix (str): The prefix of saved files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (outputs, tmp_dir), outputs is the detection results, + tmp_dir is the temporal directory created for saving submission + files when ``submission_prefix`` is not specified. 
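A hypothetical numpy-only sketch (not from the patch itself) of the idea behind the re-sampled `scene_idxs` that `get_scene_idxs` above requires for training: scenes with more points are visited more often within an epoch. The real indices come from the data preparation scripts; the proportional-repeat heuristic here is only illustrative.

import numpy as np

num_points_per_scene = np.array([80_000, 240_000, 160_000])
avg = num_points_per_scene.mean()
repeats = np.maximum(1, np.round(num_points_per_scene / avg)).astype(int)
scene_idxs = np.repeat(np.arange(len(num_points_per_scene)), repeats)
print(scene_idxs)   # [0 1 1 2] -> the large scene 1 is sampled twice per epoch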
+ """ + import mmcv + + if txtfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + txtfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + mmcv.mkdir_or_exist(txtfile_prefix) + + # need to map network output to original label idx + pred2label = np.zeros(len(self.VALID_CLASS_IDS)).astype(np.int) + for original_label, output_idx in self.label_map.items(): + if output_idx != self.ignore_index: + pred2label[output_idx] = original_label + + outputs = [] + for i, result in enumerate(results): + info = self.data_infos[i] + sample_idx = info['point_cloud']['lidar_idx'] + pred_sem_mask = result['semantic_mask'].numpy().astype(np.int) + pred_label = pred2label[pred_sem_mask] + curr_file = f'{txtfile_prefix}/{sample_idx}.txt' + np.savetxt(curr_file, pred_label, fmt='%d') + outputs.append(dict(seg_mask=pred_label)) + + return outputs, tmp_dir + + +@DATASETS.register_module() +@SEG_DATASETS.register_module() +class ScanNetInstanceSegDataset(Custom3DSegDataset): + CLASSES = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin') + + VALID_CLASS_IDS = (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, + 36, 39) + + ALL_CLASS_IDS = tuple(range(41)) + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: annotation information consists of the following keys: + - pts_semantic_mask_path (str): Path of semantic masks. + - pts_instance_mask_path (str): Path of instance masks. + """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + + pts_instance_mask_path = osp.join(self.data_root, + info['pts_instance_mask_path']) + pts_semantic_mask_path = osp.join(self.data_root, + info['pts_semantic_mask_path']) + + anns_results = dict( + pts_instance_mask_path=pts_instance_mask_path, + pts_semantic_mask_path=pts_semantic_mask_path) + return anns_results + + def get_classes_and_palette(self, classes=None, palette=None): + """Get class names of current dataset. Palette is simply ignored for + instance segmentation. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + Defaults to None. + palette (Sequence[Sequence[int]]] | np.ndarray | None): + The palette of segmentation map. If None is given, random + palette will be generated. Defaults to None. 
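A numpy-only sketch (not from the patch itself) of the `pred2label` remapping in `format_results` above: predicted train ids are converted back to raw ScanNet category ids before the per-scene txt files are written. The `label_map` here is a made-up stand-in for the one built by `Custom3DSegDataset`.

import numpy as np

VALID_CLASS_IDS = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28,
                   33, 34, 36, 39)
ignore_index = 20
# raw category id -> network output id (ignore_index for unused categories)
label_map = {cat_id: i for i, cat_id in enumerate(VALID_CLASS_IDS)}
label_map[0] = ignore_index                        # e.g. unannotated points are ignored

pred2label = np.zeros(len(VALID_CLASS_IDS), dtype=np.int64)
for original_label, output_idx in label_map.items():
    if output_idx != ignore_index:
        pred2label[output_idx] = original_label

pred_sem_mask = np.array([0, 19, 14])              # per-point predicted train ids
print(pred2label[pred_sem_mask])                   # [ 1 39 24] raw ScanNet ids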
+ """ + if classes is not None: + return classes, None + return self.CLASSES, None + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=True, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=self.VALID_CLASS_IDS, + max_cat_id=40), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=self.CLASSES), + dict( + type='Collect3D', + keys=['points', 'pts_semantic_mask', 'pts_instance_mask']) + ] + return Compose(pipeline) + + def evaluate(self, + results, + metric=None, + options=None, + logger=None, + show=False, + out_dir=None, + pipeline=None): + """Evaluation in instance segmentation protocol. + + Args: + results (list[dict]): List of results. + metric (str | list[str]): Metrics to be evaluated. + options (dict, optional): options for instance_seg_eval. + logger (logging.Logger | None | str): Logger used for printing + related information during evaluation. Defaults to None. + show (bool, optional): Whether to visualize. + Defaults to False. + out_dir (str, optional): Path to save the visualization results. + Defaults to None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict: Evaluation results. + """ + assert isinstance( + results, list), f'Expect results to be list, got {type(results)}.' + assert len(results) > 0, 'Expect length of results > 0.' + assert len(results) == len(self.data_infos) + assert isinstance( + results[0], dict + ), f'Expect elements in results to be dict, got {type(results[0])}.' + + load_pipeline = self._get_pipeline(pipeline) + pred_instance_masks = [result['instance_mask'] for result in results] + pred_instance_labels = [result['instance_label'] for result in results] + pred_instance_scores = [result['instance_score'] for result in results] + gt_semantic_masks, gt_instance_masks = zip(*[ + self._extract_data( + index=i, + pipeline=load_pipeline, + key=['pts_semantic_mask', 'pts_instance_mask'], + load_annos=True) for i in range(len(self.data_infos)) + ]) + ret_dict = instance_seg_eval( + gt_semantic_masks, + gt_instance_masks, + pred_instance_masks, + pred_instance_labels, + pred_instance_scores, + valid_class_ids=self.VALID_CLASS_IDS, + class_labels=self.CLASSES, + options=options, + logger=logger) + + if show: + raise NotImplementedError('show is not implemented for now') + + return ret_dict diff --git a/mmdet3d/datasets/semantickitti_dataset.py b/mmdet3d/datasets/semantickitti_dataset.py index 03afbe0..c9dc2ab 100644 --- a/mmdet3d/datasets/semantickitti_dataset.py +++ b/mmdet3d/datasets/semantickitti_dataset.py @@ -1,110 +1,110 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from os import path as osp - -from .builder import DATASETS -from .custom_3d import Custom3DDataset - - -@DATASETS.register_module() -class SemanticKITTIDataset(Custom3DDataset): - r"""SemanticKITTI Dataset. - - This class serves as the API for experiments on the SemanticKITTI Dataset - Please refer to `_ - for data downloading - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. 
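A sketch (not from the patch itself) of the per-scene result dict that `ScanNetInstanceSegDataset.evaluate` above consumes. The shapes reflect my reading of the code and the values are invented; in real results these entries are typically CPU torch tensors rather than numpy arrays.

import numpy as np

num_points, num_instances = 50_000, 3
result = dict(
    # instance id per point; points not assigned to any instance use an unused id
    instance_mask=np.random.randint(-1, num_instances, size=num_points),
    # semantic label of each predicted instance, indexing CLASSES above
    instance_label=np.array([2, 4, 17]),
    # confidence used to rank instances during AP computation
    instance_score=np.array([0.91, 0.75, 0.40]),
)
assert result['instance_label'].shape == result['instance_score'].shape
print(result['instance_mask'].shape, result['instance_label'])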
- modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): NO 3D box for this dataset. - You can choose any type - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'LiDAR' in this dataset. Available options includes - - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - """ - CLASSES = ('unlabeled', 'car', 'bicycle', 'motorcycle', 'truck', 'bus', - 'person', 'bicyclist', 'motorcyclist', 'road', 'parking', - 'sidewalk', 'other-ground', 'building', 'fence', 'vegetation', - 'trunck', 'terrian', 'pole', 'traffic-sign') - - def __init__(self, - data_root, - ann_file, - pipeline=None, - classes=None, - modality=None, - box_type_3d='Lidar', - filter_empty_gt=False, - test_mode=False): - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - modality=modality, - box_type_3d=box_type_3d, - filter_empty_gt=filter_empty_gt, - test_mode=test_mode) - - def get_data_info(self, index): - """Get data info according to the given index. - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - sample_idx (str): Sample index. - - pts_filename (str): Filename of point clouds. - - file_name (str): Filename of point clouds. - - ann_info (dict): Annotation info. - """ - info = self.data_infos[index] - sample_idx = info['point_cloud']['lidar_idx'] - pts_filename = osp.join(self.data_root, info['pts_path']) - - input_dict = dict( - pts_filename=pts_filename, - sample_idx=sample_idx, - file_name=pts_filename) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): - return None - return input_dict - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. - - Returns: - dict: annotation information consists of the following keys: - - - pts_semantic_mask_path (str): Path of semantic masks. - """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - - pts_semantic_mask_path = osp.join(self.data_root, - info['pts_semantic_mask_path']) - - anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) - return anns_results +# Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + +from .builder import DATASETS +from .custom_3d import Custom3DDataset + + +@DATASETS.register_module() +class SemanticKITTIDataset(Custom3DDataset): + r"""SemanticKITTI Dataset. + + This class serves as the API for experiments on the SemanticKITTI Dataset + Please refer to `_ + for data downloading + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. 
+ box_type_3d (str, optional): NO 3D box for this dataset. + You can choose any type + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR' in this dataset. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ + CLASSES = ('unlabeled', 'car', 'bicycle', 'motorcycle', 'truck', 'bus', + 'person', 'bicyclist', 'motorcyclist', 'road', 'parking', + 'sidewalk', 'other-ground', 'building', 'fence', 'vegetation', + 'trunck', 'terrian', 'pole', 'traffic-sign') + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + modality=None, + box_type_3d='Lidar', + filter_empty_gt=False, + test_mode=False): + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode) + + def get_data_info(self, index): + """Get data info according to the given index. + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. It includes the following keys: + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - file_name (str): Filename of point clouds. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info['point_cloud']['lidar_idx'] + pts_filename = osp.join(self.data_root, info['pts_path']) + + input_dict = dict( + pts_filename=pts_filename, + sample_idx=sample_idx, + file_name=pts_filename) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): + return None + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: annotation information consists of the following keys: + + - pts_semantic_mask_path (str): Path of semantic masks. + """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + + pts_semantic_mask_path = osp.join(self.data_root, + info['pts_semantic_mask_path']) + + anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) + return anns_results diff --git a/mmdet3d/datasets/sunrgbd_dataset.py b/mmdet3d/datasets/sunrgbd_dataset.py index 623ab88..b67c1fc 100644 --- a/mmdet3d/datasets/sunrgbd_dataset.py +++ b/mmdet3d/datasets/sunrgbd_dataset.py @@ -1,280 +1,280 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from collections import OrderedDict -from os import path as osp - -import numpy as np - -from mmdet3d.core import show_multi_modality_result, show_result -from mmdet3d.core.bbox import DepthInstance3DBoxes -from mmdet.core import eval_map -from .builder import DATASETS -from .custom_3d import Custom3DDataset -from .pipelines import Compose - - -@DATASETS.register_module() -class SUNRGBDDataset(Custom3DDataset): - r"""SUNRGBD Dataset. - - This class serves as the API for experiments on the SUNRGBD Dataset. - - See the `download page `_ - for data downloading. 
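# ---- Illustrative sketch (editorial, not part of the original diff) ----
# A minimal config for the `SemanticKITTIDataset` defined above, written in
# the usual mmdet3d dict style. Paths and class names are placeholders, and
# the pipeline simply mirrors the segmentation-style loading steps used by
# the ScanNet datasets earlier in this diff; it is not a config shipped with
# the repository.
placeholder_classes = ('car', 'person')   # hypothetical subset of CLASSES
semantickitti_train = dict(
    type='SemanticKITTIDataset',
    data_root='data/semantickitti/',                              # placeholder
    ann_file='data/semantickitti/semantickitti_infos_train.pkl',  # placeholder
    pipeline=[
        dict(
            type='LoadPointsFromFile',
            coord_type='LIDAR',
            load_dim=4,
            use_dim=4),
        dict(
            type='LoadAnnotations3D',
            with_bbox_3d=False,
            with_label_3d=False,
            with_mask_3d=False,
            with_seg_3d=True),
        dict(
            type='DefaultFormatBundle3D',
            with_label=False,
            class_names=placeholder_classes),
        dict(type='Collect3D', keys=['points', 'pts_semantic_mask']),
    ],
    test_mode=False)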
- - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'Depth' in this dataset. Available options includes - - - 'LiDAR': Box in LiDAR coordinates. - - 'Depth': Box in depth coordinates, usually for indoor dataset. - - 'Camera': Box in camera coordinates. - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - """ - CLASSES = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', - 'night_stand', 'bookshelf', 'bathtub') - - def __init__(self, - data_root, - ann_file, - pipeline=None, - classes=None, - modality=dict(use_camera=True, use_lidar=True), - box_type_3d='Depth', - filter_empty_gt=True, - test_mode=False, - **kwargs): - super().__init__( - data_root=data_root, - ann_file=ann_file, - pipeline=pipeline, - classes=classes, - modality=modality, - box_type_3d=box_type_3d, - filter_empty_gt=filter_empty_gt, - test_mode=test_mode, - **kwargs) - assert 'use_camera' in self.modality and \ - 'use_lidar' in self.modality - assert self.modality['use_camera'] or self.modality['use_lidar'] - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Data information that will be passed to the data - preprocessing pipelines. It includes the following keys: - - - sample_idx (str): Sample index. - - pts_filename (str, optional): Filename of point clouds. - - file_name (str, optional): Filename of point clouds. - - img_prefix (str, optional): Prefix of image files. - - img_info (dict, optional): Image info. - - calib (dict, optional): Camera calibration info. - - ann_info (dict): Annotation info. - """ - info = self.data_infos[index] - sample_idx = info['point_cloud']['lidar_idx'] - assert info['point_cloud']['lidar_idx'] == info['image']['image_idx'] - input_dict = dict(sample_idx=sample_idx) - - if self.modality['use_lidar']: - pts_filename = osp.join(self.data_root, info['pts_path']) - input_dict['pts_filename'] = pts_filename - input_dict['file_name'] = pts_filename - - if self.modality['use_camera']: - img_filename = osp.join( - osp.join(self.data_root, 'sunrgbd_trainval'), - info['image']['image_path']) - input_dict['img_prefix'] = None - input_dict['img_info'] = dict(filename=img_filename) - calib = info['calib'] - rt_mat = calib['Rt'] - # follow Coord3DMode.convert_point - rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0] - ]) @ rt_mat.transpose(1, 0) - depth2img = calib['K'] @ rt_mat - input_dict['depth2img'] = depth2img - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - if self.filter_empty_gt and len(annos['gt_bboxes_3d']) == 0: - return None - return input_dict - - def get_ann_info(self, index): - """Get annotation info according to the given index. - - Args: - index (int): Index of the annotation data to get. 
- - Returns: - dict: annotation information consists of the following keys: - - - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): - 3D ground truth bboxes - - gt_labels_3d (np.ndarray): Labels of ground truths. - - pts_instance_mask_path (str): Path of instance masks. - - pts_semantic_mask_path (str): Path of semantic masks. - """ - # Use index to get the annos, thus the evalhook could also use this api - info = self.data_infos[index] - if info['annos']['gt_num'] != 0: - gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype( - np.float32) # k, 6 - gt_labels_3d = info['annos']['class'].astype(np.int64) - else: - gt_bboxes_3d = np.zeros((0, 7), dtype=np.float32) - gt_labels_3d = np.zeros((0, ), dtype=np.int64) - - # to target box structure - gt_bboxes_3d = DepthInstance3DBoxes( - gt_bboxes_3d, origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) - - anns_results = dict( - gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d) - - if self.modality['use_camera']: - if info['annos']['gt_num'] != 0: - gt_bboxes_2d = info['annos']['bbox'].astype(np.float32) - else: - gt_bboxes_2d = np.zeros((0, 4), dtype=np.float32) - anns_results['bboxes'] = gt_bboxes_2d - anns_results['labels'] = gt_labels_3d - - return anns_results - - def _build_default_pipeline(self): - """Build the default pipeline for this dataset.""" - pipeline = [ - dict( - type='LoadPointsFromFile', - coord_type='DEPTH', - shift_height=False, - load_dim=6, - use_dim=[0, 1, 2]), - dict( - type='DefaultFormatBundle3D', - class_names=self.CLASSES, - with_label=False), - dict(type='Collect3D', keys=['points']) - ] - if self.modality['use_camera']: - pipeline.insert(0, dict(type='LoadImageFromFile')) - return Compose(pipeline) - - def show(self, results, out_dir, show=True, pipeline=None): - """Results visualization. - - Args: - results (list[dict]): List of bounding boxes results. - out_dir (str): Output directory of visualization result. - show (bool): Visualize the results online. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - pipeline = self._get_pipeline(pipeline) - for i, result in enumerate(results): - data_info = self.data_infos[i] - pts_path = data_info['pts_path'] - file_name = osp.split(pts_path)[-1].split('.')[0] - points, img_metas, img = self._extract_data( - i, pipeline, ['points', 'img_metas', 'img']) - # scale colors to [0, 255] - points = points.numpy() - points[:, 3:] *= 255 - - gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() - pred_bboxes = result['boxes_3d'].tensor.numpy() - show_result(points, gt_bboxes.copy(), pred_bboxes.copy(), out_dir, - file_name, show) - - # multi-modality visualization - if self.modality['use_camera']: - img = img.numpy() - # need to transpose channel to first dim - img = img.transpose(1, 2, 0) - pred_bboxes = DepthInstance3DBoxes( - pred_bboxes, origin=(0.5, 0.5, 0)) - gt_bboxes = DepthInstance3DBoxes( - gt_bboxes, origin=(0.5, 0.5, 0)) - show_multi_modality_result( - img, - gt_bboxes, - pred_bboxes, - None, - out_dir, - file_name, - box_mode='depth', - img_metas=img_metas, - show=show) - - def evaluate(self, - results, - metric=None, - iou_thr=(0.25, 0.5), - iou_thr_2d=(0.5, ), - logger=None, - show=False, - out_dir=None, - pipeline=None): - """Evaluate. - - Evaluation in indoor protocol. - - Args: - results (list[dict]): List of results. - metric (str | list[str], optional): Metrics to be evaluated. - Default: None. 
- iou_thr (list[float], optional): AP IoU thresholds for 3D - evaluation. Default: (0.25, 0.5). - iou_thr_2d (list[float], optional): AP IoU thresholds for 2D - evaluation. Default: (0.5, ). - show (bool, optional): Whether to visualize. - Default: False. - out_dir (str, optional): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. - - Returns: - dict: Evaluation results. - """ - # evaluate 3D detection performance - if isinstance(results[0], dict): - return super().evaluate(results, metric, iou_thr, logger, show, - out_dir, pipeline) - # evaluate 2D detection performance - else: - eval_results = OrderedDict() - annotations = [self.get_ann_info(i) for i in range(len(self))] - iou_thr_2d = (iou_thr_2d) if isinstance(iou_thr_2d, - float) else iou_thr_2d - for iou_thr_2d_single in iou_thr_2d: - mean_ap, _ = eval_map( - results, - annotations, - scale_ranges=None, - iou_thr=iou_thr_2d_single, - dataset=self.CLASSES, - logger=logger) - eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap - return eval_results +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from os import path as osp + +import numpy as np + +from mmdet3d.core import show_multi_modality_result, show_result +from mmdet3d.core.bbox import DepthInstance3DBoxes +from mmdet.core import eval_map +from .builder import DATASETS +from .custom_3d import Custom3DDataset +from .pipelines import Compose + + +@DATASETS.register_module() +class SUNRGBDDataset(Custom3DDataset): + r"""SUNRGBD Dataset. + + This class serves as the API for experiments on the SUNRGBD Dataset. + + See the `download page `_ + for data downloading. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'Depth' in this dataset. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ + CLASSES = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub') + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + modality=dict(use_camera=True, use_lidar=True), + box_type_3d='Depth', + filter_empty_gt=True, + test_mode=False, + **kwargs): + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + **kwargs) + assert 'use_camera' in self.modality and \ + 'use_lidar' in self.modality + assert self.modality['use_camera'] or self.modality['use_lidar'] + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. 
+ + Returns: + dict: Data information that will be passed to the data + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str, optional): Filename of point clouds. + - file_name (str, optional): Filename of point clouds. + - img_prefix (str, optional): Prefix of image files. + - img_info (dict, optional): Image info. + - calib (dict, optional): Camera calibration info. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info['point_cloud']['lidar_idx'] + assert info['point_cloud']['lidar_idx'] == info['image']['image_idx'] + input_dict = dict(sample_idx=sample_idx) + + if self.modality['use_lidar']: + pts_filename = osp.join(self.data_root, info['pts_path']) + input_dict['pts_filename'] = pts_filename + input_dict['file_name'] = pts_filename + + if self.modality['use_camera']: + img_filename = osp.join( + osp.join(self.data_root, 'sunrgbd_trainval'), + info['image']['image_path']) + input_dict['img_prefix'] = None + input_dict['img_info'] = dict(filename=img_filename) + calib = info['calib'] + rt_mat = calib['Rt'] + # follow Coord3DMode.convert_point + rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0] + ]) @ rt_mat.transpose(1, 0) + depth2img = calib['K'] @ rt_mat + input_dict['depth2img'] = depth2img + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + if self.filter_empty_gt and len(annos['gt_bboxes_3d']) == 0: + return None + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - pts_instance_mask_path (str): Path of instance masks. + - pts_semantic_mask_path (str): Path of semantic masks. + """ + # Use index to get the annos, thus the evalhook could also use this api + info = self.data_infos[index] + if info['annos']['gt_num'] != 0: + gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype( + np.float32) # k, 6 + gt_labels_3d = info['annos']['class'].astype(np.int64) + else: + gt_bboxes_3d = np.zeros((0, 7), dtype=np.float32) + gt_labels_3d = np.zeros((0, ), dtype=np.int64) + + # to target box structure + gt_bboxes_3d = DepthInstance3DBoxes( + gt_bboxes_3d, origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d) + + if self.modality['use_camera']: + if info['annos']['gt_num'] != 0: + gt_bboxes_2d = info['annos']['bbox'].astype(np.float32) + else: + gt_bboxes_2d = np.zeros((0, 4), dtype=np.float32) + anns_results['bboxes'] = gt_bboxes_2d + anns_results['labels'] = gt_labels_3d + + return anns_results + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['points']) + ] + if self.modality['use_camera']: + pipeline.insert(0, dict(type='LoadImageFromFile')) + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. 
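# ---- Illustrative sketch (editorial, not part of the original diff) ----
# `depth2img` above is a plain 3x3 product K @ R: the fixed permutation
# matrix re-orders the depth-frame axes into the camera convention (following
# Coord3DMode.convert_point), and K projects onto the image plane. Applying
# the matrix to a point and dividing by the last coordinate yields pixel
# coordinates. Toy calibration values, for illustration only:
import numpy as np

K = np.array([[500., 0., 320.],
              [0., 500., 240.],
              [0., 0., 1.]])
Rt = np.eye(3)                           # hypothetical depth-to-camera rotation
rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) @ Rt.transpose(1, 0)
depth2img = K @ rt_mat

xyz_depth = np.array([0.5, 2.0, -0.5])   # x right, y forward, z up (depth frame)
uvw = depth2img @ xyz_depth
u, v = uvw[:2] / uvw[2]                  # perspective divide on the depth term
print(u, v)                              # ~ (445.0, 365.0) for these toy values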
+ out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + data_info = self.data_infos[i] + pts_path = data_info['pts_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points, img_metas, img = self._extract_data( + i, pipeline, ['points', 'img_metas', 'img']) + # scale colors to [0, 255] + points = points.numpy() + points[:, 3:] *= 255 + + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() + pred_bboxes = result['boxes_3d'].tensor.numpy() + show_result(points, gt_bboxes.copy(), pred_bboxes.copy(), out_dir, + file_name, show) + + # multi-modality visualization + if self.modality['use_camera']: + img = img.numpy() + # need to transpose channel to first dim + img = img.transpose(1, 2, 0) + pred_bboxes = DepthInstance3DBoxes( + pred_bboxes, origin=(0.5, 0.5, 0)) + gt_bboxes = DepthInstance3DBoxes( + gt_bboxes, origin=(0.5, 0.5, 0)) + show_multi_modality_result( + img, + gt_bboxes, + pred_bboxes, + None, + out_dir, + file_name, + box_mode='depth', + img_metas=img_metas, + show=show) + + def evaluate(self, + results, + metric=None, + iou_thr=(0.25, 0.5), + iou_thr_2d=(0.5, ), + logger=None, + show=False, + out_dir=None, + pipeline=None): + """Evaluate. + + Evaluation in indoor protocol. + + Args: + results (list[dict]): List of results. + metric (str | list[str], optional): Metrics to be evaluated. + Default: None. + iou_thr (list[float], optional): AP IoU thresholds for 3D + evaluation. Default: (0.25, 0.5). + iou_thr_2d (list[float], optional): AP IoU thresholds for 2D + evaluation. Default: (0.5, ). + show (bool, optional): Whether to visualize. + Default: False. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict: Evaluation results. + """ + # evaluate 3D detection performance + if isinstance(results[0], dict): + return super().evaluate(results, metric, iou_thr, logger, show, + out_dir, pipeline) + # evaluate 2D detection performance + else: + eval_results = OrderedDict() + annotations = [self.get_ann_info(i) for i in range(len(self))] + iou_thr_2d = (iou_thr_2d) if isinstance(iou_thr_2d, + float) else iou_thr_2d + for iou_thr_2d_single in iou_thr_2d: + mean_ap, _ = eval_map( + results, + annotations, + scale_ranges=None, + iou_thr=iou_thr_2d_single, + dataset=self.CLASSES, + logger=logger) + eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap + return eval_results diff --git a/mmdet3d/datasets/utils.py b/mmdet3d/datasets/utils.py index e9cfda1..bdd9af3 100644 --- a/mmdet3d/datasets/utils.py +++ b/mmdet3d/datasets/utils.py @@ -1,140 +1,140 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import mmcv - -# yapf: disable -from mmdet3d.datasets.pipelines import (Collect3D, DefaultFormatBundle3D, - LoadAnnotations3D, - LoadImageFromFileMono3D, - LoadMultiViewImageFromFiles, - LoadPointsFromFile, - LoadPointsFromMultiSweeps, - MultiScaleFlipAug3D, - PointSegClassMapping) -from mmdet.datasets.pipelines import LoadImageFromFile, MultiScaleFlipAug -# yapf: enable -from .builder import PIPELINES - - -def is_loading_function(transform): - """Judge whether a transform function is a loading function. 
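# ---- Illustrative sketch (editorial, not part of the original diff) ----
# `SUNRGBDDataset.evaluate` above dispatches on the result type: dict entries
# are scored with the 3D indoor protocol via the parent class, while plain
# per-class box lists are treated as 2D detections and scored with mmdet's
# `eval_map`, once per IoU threshold. Note that `(iou_thr_2d)` is not a
# one-element tuple -- parentheses alone do not build a tuple in Python -- so
# a bare float passed as `iou_thr_2d` stays a float; the trailing-comma form
# is what that normalisation appears to intend:
iou_thr_2d = 0.5
print(type((iou_thr_2d)))     # <class 'float'> -- unchanged by the parentheses
print(type((iou_thr_2d, )))   # <class 'tuple'> -- a genuine one-element tuple

# Iterating over a tuple of thresholds is what the loop in `evaluate` relies on:
for iou_thr_2d_single in (iou_thr_2d, ):
    print('would call eval_map with iou_thr =', iou_thr_2d_single)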
- - Note: `MultiScaleFlipAug3D` is a wrapper for multiple pipeline functions, - so we need to search if its inner transforms contain any loading function. - - Args: - transform (dict | :obj:`Pipeline`): A transform config or a function. - - Returns: - bool: Whether it is a loading function. None means can't judge. - When transform is `MultiScaleFlipAug3D`, we return None. - """ - # TODO: use more elegant way to distinguish loading modules - loading_functions = (LoadImageFromFile, LoadPointsFromFile, - LoadAnnotations3D, LoadMultiViewImageFromFiles, - LoadPointsFromMultiSweeps, DefaultFormatBundle3D, - Collect3D, LoadImageFromFileMono3D, - PointSegClassMapping) - if isinstance(transform, dict): - obj_cls = PIPELINES.get(transform['type']) - if obj_cls is None: - return False - if obj_cls in loading_functions: - return True - if obj_cls in (MultiScaleFlipAug3D, MultiScaleFlipAug): - return None - elif callable(transform): - if isinstance(transform, loading_functions): - return True - if isinstance(transform, (MultiScaleFlipAug3D, MultiScaleFlipAug)): - return None - return False - - -def get_loading_pipeline(pipeline): - """Only keep loading image, points and annotations related configuration. - - Args: - pipeline (list[dict] | list[:obj:`Pipeline`]): - Data pipeline configs or list of pipeline functions. - - Returns: - list[dict] | list[:obj:`Pipeline`]): The new pipeline list with only - keep loading image, points and annotations related configuration. - - Examples: - >>> pipelines = [ - ... dict(type='LoadPointsFromFile', - ... coord_type='LIDAR', load_dim=4, use_dim=4), - ... dict(type='LoadImageFromFile'), - ... dict(type='LoadAnnotations3D', - ... with_bbox=True, with_label_3d=True), - ... dict(type='Resize', - ... img_scale=[(640, 192), (2560, 768)], keep_ratio=True), - ... dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), - ... dict(type='PointsRangeFilter', - ... point_cloud_range=point_cloud_range), - ... dict(type='ObjectRangeFilter', - ... point_cloud_range=point_cloud_range), - ... dict(type='PointShuffle'), - ... dict(type='Normalize', **img_norm_cfg), - ... dict(type='Pad', size_divisor=32), - ... dict(type='DefaultFormatBundle3D', class_names=class_names), - ... dict(type='Collect3D', - ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) - ... ] - >>> expected_pipelines = [ - ... dict(type='LoadPointsFromFile', - ... coord_type='LIDAR', load_dim=4, use_dim=4), - ... dict(type='LoadImageFromFile'), - ... dict(type='LoadAnnotations3D', - ... with_bbox=True, with_label_3d=True), - ... dict(type='DefaultFormatBundle3D', class_names=class_names), - ... dict(type='Collect3D', - ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) - ... ] - >>> assert expected_pipelines == \ - ... get_loading_pipeline(pipelines) - """ - loading_pipeline = [] - for transform in pipeline: - is_loading = is_loading_function(transform) - if is_loading is None: # MultiScaleFlipAug3D - # extract its inner pipeline - if isinstance(transform, dict): - inner_pipeline = transform.get('transforms', []) - else: - inner_pipeline = transform.transforms.transforms - loading_pipeline.extend(get_loading_pipeline(inner_pipeline)) - elif is_loading: - loading_pipeline.append(transform) - assert len(loading_pipeline) > 0, \ - 'The data pipeline in your config file must include ' \ - 'loading step.' - return loading_pipeline - - -def extract_result_dict(results, key): - """Extract and return the data corresponding to key in result dict. 
- - ``results`` is a dict output from `pipeline(input_dict)`, which is the - loaded data from ``Dataset`` class. - The data terms inside may be wrapped in list, tuple and DataContainer, so - this function essentially extracts data from these wrappers. - - Args: - results (dict): Data loaded using pipeline. - key (str): Key of the desired data. - - Returns: - np.ndarray | torch.Tensor: Data term. - """ - if key not in results.keys(): - return None - # results[key] may be data or list[data] or tuple[data] - # data may be wrapped inside DataContainer - data = results[key] - if isinstance(data, (list, tuple)): - data = data[0] - if isinstance(data, mmcv.parallel.DataContainer): - data = data._data - return data +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv + +# yapf: disable +from mmdet3d.datasets.pipelines import (Collect3D, DefaultFormatBundle3D, + LoadAnnotations3D, + LoadImageFromFileMono3D, + LoadMultiViewImageFromFiles, + LoadPointsFromFile, + LoadPointsFromMultiSweeps, + MultiScaleFlipAug3D, + PointSegClassMapping) +from mmdet.datasets.pipelines import LoadImageFromFile, MultiScaleFlipAug +# yapf: enable +from .builder import PIPELINES + + +def is_loading_function(transform): + """Judge whether a transform function is a loading function. + + Note: `MultiScaleFlipAug3D` is a wrapper for multiple pipeline functions, + so we need to search if its inner transforms contain any loading function. + + Args: + transform (dict | :obj:`Pipeline`): A transform config or a function. + + Returns: + bool: Whether it is a loading function. None means can't judge. + When transform is `MultiScaleFlipAug3D`, we return None. + """ + # TODO: use more elegant way to distinguish loading modules + loading_functions = (LoadImageFromFile, LoadPointsFromFile, + LoadAnnotations3D, LoadMultiViewImageFromFiles, + LoadPointsFromMultiSweeps, DefaultFormatBundle3D, + Collect3D, LoadImageFromFileMono3D, + PointSegClassMapping) + if isinstance(transform, dict): + obj_cls = PIPELINES.get(transform['type']) + if obj_cls is None: + return False + if obj_cls in loading_functions: + return True + if obj_cls in (MultiScaleFlipAug3D, MultiScaleFlipAug): + return None + elif callable(transform): + if isinstance(transform, loading_functions): + return True + if isinstance(transform, (MultiScaleFlipAug3D, MultiScaleFlipAug)): + return None + return False + + +def get_loading_pipeline(pipeline): + """Only keep loading image, points and annotations related configuration. + + Args: + pipeline (list[dict] | list[:obj:`Pipeline`]): + Data pipeline configs or list of pipeline functions. + + Returns: + list[dict] | list[:obj:`Pipeline`]): The new pipeline list with only + keep loading image, points and annotations related configuration. + + Examples: + >>> pipelines = [ + ... dict(type='LoadPointsFromFile', + ... coord_type='LIDAR', load_dim=4, use_dim=4), + ... dict(type='LoadImageFromFile'), + ... dict(type='LoadAnnotations3D', + ... with_bbox=True, with_label_3d=True), + ... dict(type='Resize', + ... img_scale=[(640, 192), (2560, 768)], keep_ratio=True), + ... dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + ... dict(type='PointsRangeFilter', + ... point_cloud_range=point_cloud_range), + ... dict(type='ObjectRangeFilter', + ... point_cloud_range=point_cloud_range), + ... dict(type='PointShuffle'), + ... dict(type='Normalize', **img_norm_cfg), + ... dict(type='Pad', size_divisor=32), + ... dict(type='DefaultFormatBundle3D', class_names=class_names), + ... dict(type='Collect3D', + ... 
keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) + ... ] + >>> expected_pipelines = [ + ... dict(type='LoadPointsFromFile', + ... coord_type='LIDAR', load_dim=4, use_dim=4), + ... dict(type='LoadImageFromFile'), + ... dict(type='LoadAnnotations3D', + ... with_bbox=True, with_label_3d=True), + ... dict(type='DefaultFormatBundle3D', class_names=class_names), + ... dict(type='Collect3D', + ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) + ... ] + >>> assert expected_pipelines == \ + ... get_loading_pipeline(pipelines) + """ + loading_pipeline = [] + for transform in pipeline: + is_loading = is_loading_function(transform) + if is_loading is None: # MultiScaleFlipAug3D + # extract its inner pipeline + if isinstance(transform, dict): + inner_pipeline = transform.get('transforms', []) + else: + inner_pipeline = transform.transforms.transforms + loading_pipeline.extend(get_loading_pipeline(inner_pipeline)) + elif is_loading: + loading_pipeline.append(transform) + assert len(loading_pipeline) > 0, \ + 'The data pipeline in your config file must include ' \ + 'loading step.' + return loading_pipeline + + +def extract_result_dict(results, key): + """Extract and return the data corresponding to key in result dict. + + ``results`` is a dict output from `pipeline(input_dict)`, which is the + loaded data from ``Dataset`` class. + The data terms inside may be wrapped in list, tuple and DataContainer, so + this function essentially extracts data from these wrappers. + + Args: + results (dict): Data loaded using pipeline. + key (str): Key of the desired data. + + Returns: + np.ndarray | torch.Tensor: Data term. + """ + if key not in results.keys(): + return None + # results[key] may be data or list[data] or tuple[data] + # data may be wrapped inside DataContainer + data = results[key] + if isinstance(data, (list, tuple)): + data = data[0] + if isinstance(data, mmcv.parallel.DataContainer): + data = data._data + return data diff --git a/mmdet3d/datasets/waymo_dataset.py b/mmdet3d/datasets/waymo_dataset.py index 6e204df..7b32888 100644 --- a/mmdet3d/datasets/waymo_dataset.py +++ b/mmdet3d/datasets/waymo_dataset.py @@ -1,549 +1,549 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import tempfile -from os import path as osp - -import mmcv -import numpy as np -import torch -from mmcv.utils import print_log - -from ..core.bbox import Box3DMode, points_cam2img -from .builder import DATASETS -from .kitti_dataset import KittiDataset - - -@DATASETS.register_module() -class WaymoDataset(KittiDataset): - """Waymo Dataset. - - This class serves as the API for experiments on the Waymo Dataset. - - Please refer to ``_for data downloading. - It is recommended to symlink the dataset root to $MMDETECTION3D/data and - organize them as the doc shows. - - Args: - data_root (str): Path of dataset root. - ann_file (str): Path of annotation file. - split (str): Split of input data. - pts_prefix (str, optional): Prefix of points files. - Defaults to 'velodyne'. - pipeline (list[dict], optional): Pipeline used for data processing. - Defaults to None. - classes (tuple[str], optional): Classes used in the dataset. - Defaults to None. - modality (dict, optional): Modality to specify the sensor data used - as input. Defaults to None. - box_type_3d (str, optional): Type of 3D box of this dataset. - Based on the `box_type_3d`, the dataset will encapsulate the box - to its original format then converted them to `box_type_3d`. - Defaults to 'LiDAR' in this dataset. 
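# ---- Illustrative sketch (editorial, not part of the original diff) ----
# `is_loading_function` above is deliberately tri-state: True for loading and
# formatting transforms, False for everything else, and None for the
# MultiScaleFlipAug(3D) wrappers, whose inner `transforms` list is then
# searched recursively by `get_loading_pipeline`. A quick check, assuming an
# environment where mmdet3d itself is importable:
from mmdet3d.datasets.utils import get_loading_pipeline, is_loading_function

print(is_loading_function(
    dict(type='LoadPointsFromFile', coord_type='LIDAR',
         load_dim=4, use_dim=4)))                      # True  (loading step)
print(is_loading_function(dict(type='PointShuffle')))  # False (augmentation)
print(is_loading_function(dict(type='MultiScaleFlipAug3D')))  # None (wrapper)

pipeline = [
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=('Car', ), with_label=False),
    dict(type='Collect3D', keys=['points']),
]
# Only the loading / formatting steps survive the filtering:
print(get_loading_pipeline(pipeline))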
Available options includes - - - 'LiDAR': box in LiDAR coordinates - - 'Depth': box in depth coordinates, usually for indoor dataset - - 'Camera': box in camera coordinates - filter_empty_gt (bool, optional): Whether to filter empty GT. - Defaults to True. - test_mode (bool, optional): Whether the dataset is in test mode. - Defaults to False. - pcd_limit_range (list(float), optional): The range of point cloud used - to filter invalid predicted boxes. - Default: [-85, -85, -5, 85, 85, 5]. - """ - - CLASSES = ('Car', 'Cyclist', 'Pedestrian') - - def __init__(self, - data_root, - ann_file, - split, - pts_prefix='velodyne', - pipeline=None, - classes=None, - modality=None, - box_type_3d='LiDAR', - filter_empty_gt=True, - test_mode=False, - load_interval=1, - pcd_limit_range=[-85, -85, -5, 85, 85, 5], - **kwargs): - super().__init__( - data_root=data_root, - ann_file=ann_file, - split=split, - pts_prefix=pts_prefix, - pipeline=pipeline, - classes=classes, - modality=modality, - box_type_3d=box_type_3d, - filter_empty_gt=filter_empty_gt, - test_mode=test_mode, - pcd_limit_range=pcd_limit_range, - **kwargs) - - # to load a subset, just set the load_interval in the dataset config - self.data_infos = self.data_infos[::load_interval] - if hasattr(self, 'flag'): - self.flag = self.flag[::load_interval] - - def _get_pts_filename(self, idx): - pts_filename = osp.join(self.root_split, self.pts_prefix, - f'{idx:07d}.bin') - return pts_filename - - def get_data_info(self, index): - """Get data info according to the given index. - - Args: - index (int): Index of the sample data to get. - - Returns: - dict: Standard input_dict consists of the - data information. - - - sample_idx (str): sample index - - pts_filename (str): filename of point clouds - - img_prefix (str): prefix of image files - - img_info (dict): image info - - lidar2img (list[np.ndarray], optional): transformations from - lidar to different cameras - - ann_info (dict): annotation info - """ - info = self.data_infos[index] - sample_idx = info['image']['image_idx'] - img_filename = os.path.join(self.data_root, - info['image']['image_path']) - - # TODO: consider use torch.Tensor only - rect = info['calib']['R0_rect'].astype(np.float32) - Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) - P0 = info['calib']['P0'].astype(np.float32) - lidar2img = P0 @ rect @ Trv2c - - pts_filename = self._get_pts_filename(sample_idx) - input_dict = dict( - sample_idx=sample_idx, - pts_filename=pts_filename, - img_prefix=None, - img_info=dict(filename=img_filename), - lidar2img=lidar2img) - - if not self.test_mode: - annos = self.get_ann_info(index) - input_dict['ann_info'] = annos - - return input_dict - - def format_results(self, - outputs, - pklfile_prefix=None, - submission_prefix=None, - data_format='waymo'): - """Format the results to pkl file. - - Args: - outputs (list[dict]): Testing results of the dataset. - pklfile_prefix (str): The prefix of pkl files. It includes - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - submission_prefix (str): The prefix of submitted files. It - includes the file path and the prefix of filename, e.g., - "a/b/prefix". If not specified, a temp file will be created. - Default: None. - data_format (str, optional): Output data format. - Default: 'waymo'. Another supported choice is 'kitti'. 
- - Returns: - tuple: (result_files, tmp_dir), result_files is a dict containing - the json filepaths, tmp_dir is the temporal directory created - for saving json files when jsonfile_prefix is not specified. - """ - if pklfile_prefix is None: - tmp_dir = tempfile.TemporaryDirectory() - pklfile_prefix = osp.join(tmp_dir.name, 'results') - else: - tmp_dir = None - - assert ('waymo' in data_format or 'kitti' in data_format), \ - f'invalid data_format {data_format}' - - if (not isinstance(outputs[0], dict)) or 'img_bbox' in outputs[0]: - raise TypeError('Not supported type for reformat results.') - elif 'pts_bbox' in outputs[0]: - result_files = dict() - for name in outputs[0]: - results_ = [out[name] for out in outputs] - pklfile_prefix_ = pklfile_prefix + name - if submission_prefix is not None: - submission_prefix_ = f'{submission_prefix}_{name}' - else: - submission_prefix_ = None - result_files_ = self.bbox2result_kitti(results_, self.CLASSES, - pklfile_prefix_, - submission_prefix_) - result_files[name] = result_files_ - else: - result_files = self.bbox2result_kitti(outputs, self.CLASSES, - pklfile_prefix, - submission_prefix) - if 'waymo' in data_format: - from ..core.evaluation.waymo_utils.prediction_kitti_to_waymo import \ - KITTI2Waymo # noqa - waymo_root = osp.join( - self.data_root.split('kitti_format')[0], 'waymo_format') - if self.split == 'training': - waymo_tfrecords_dir = osp.join(waymo_root, 'validation') - prefix = '1' - elif self.split == 'testing': - waymo_tfrecords_dir = osp.join(waymo_root, 'testing') - prefix = '2' - else: - raise ValueError('Not supported split value.') - save_tmp_dir = tempfile.TemporaryDirectory() - waymo_results_save_dir = save_tmp_dir.name - waymo_results_final_path = f'{pklfile_prefix}.bin' - if 'pts_bbox' in result_files: - converter = KITTI2Waymo(result_files['pts_bbox'], - waymo_tfrecords_dir, - waymo_results_save_dir, - waymo_results_final_path, prefix) - else: - converter = KITTI2Waymo(result_files, waymo_tfrecords_dir, - waymo_results_save_dir, - waymo_results_final_path, prefix) - converter.convert() - save_tmp_dir.cleanup() - - return result_files, tmp_dir - - def evaluate(self, - results, - metric='waymo', - logger=None, - pklfile_prefix=None, - submission_prefix=None, - show=False, - out_dir=None, - pipeline=None): - """Evaluation in KITTI protocol. - - Args: - results (list[dict]): Testing results of the dataset. - metric (str | list[str], optional): Metrics to be evaluated. - Default: 'waymo'. Another supported metric is 'kitti'. - logger (logging.Logger | str, optional): Logger used for printing - related information during evaluation. Default: None. - pklfile_prefix (str, optional): The prefix of pkl files including - the file path and the prefix of filename, e.g., "a/b/prefix". - If not specified, a temp file will be created. Default: None. - submission_prefix (str, optional): The prefix of submission data. - If not specified, the submission data will not be generated. - show (bool, optional): Whether to visualize. - Default: False. - out_dir (str, optional): Path to save the visualization results. - Default: None. - pipeline (list[dict], optional): raw data loading for showing. - Default: None. 
- - Returns: - dict[str: float]: results of each evaluation metric - """ - assert ('waymo' in metric or 'kitti' in metric), \ - f'invalid metric {metric}' - if 'kitti' in metric: - result_files, tmp_dir = self.format_results( - results, - pklfile_prefix, - submission_prefix, - data_format='kitti') - from mmdet3d.core.evaluation import kitti_eval - gt_annos = [info['annos'] for info in self.data_infos] - - if isinstance(result_files, dict): - ap_dict = dict() - for name, result_files_ in result_files.items(): - eval_types = ['bev', '3d'] - ap_result_str, ap_dict_ = kitti_eval( - gt_annos, - result_files_, - self.CLASSES, - eval_types=eval_types) - for ap_type, ap in ap_dict_.items(): - ap_dict[f'{name}/{ap_type}'] = float( - '{:.4f}'.format(ap)) - - print_log( - f'Results of {name}:\n' + ap_result_str, logger=logger) - - else: - ap_result_str, ap_dict = kitti_eval( - gt_annos, - result_files, - self.CLASSES, - eval_types=['bev', '3d']) - print_log('\n' + ap_result_str, logger=logger) - if 'waymo' in metric: - waymo_root = osp.join( - self.data_root.split('kitti_format')[0], 'waymo_format') - if pklfile_prefix is None: - eval_tmp_dir = tempfile.TemporaryDirectory() - pklfile_prefix = osp.join(eval_tmp_dir.name, 'results') - else: - eval_tmp_dir = None - result_files, tmp_dir = self.format_results( - results, - pklfile_prefix, - submission_prefix, - data_format='waymo') - import subprocess - ret_bytes = subprocess.check_output( - 'mmdet3d/core/evaluation/waymo_utils/' + - f'compute_detection_metrics_main {pklfile_prefix}.bin ' + - f'{waymo_root}/gt.bin', - shell=True) - ret_texts = ret_bytes.decode('utf-8') - print_log(ret_texts) - # parse the text to get ap_dict - ap_dict = { - 'Vehicle/L1 mAP': 0, - 'Vehicle/L1 mAPH': 0, - 'Vehicle/L2 mAP': 0, - 'Vehicle/L2 mAPH': 0, - 'Pedestrian/L1 mAP': 0, - 'Pedestrian/L1 mAPH': 0, - 'Pedestrian/L2 mAP': 0, - 'Pedestrian/L2 mAPH': 0, - 'Sign/L1 mAP': 0, - 'Sign/L1 mAPH': 0, - 'Sign/L2 mAP': 0, - 'Sign/L2 mAPH': 0, - 'Cyclist/L1 mAP': 0, - 'Cyclist/L1 mAPH': 0, - 'Cyclist/L2 mAP': 0, - 'Cyclist/L2 mAPH': 0, - 'Overall/L1 mAP': 0, - 'Overall/L1 mAPH': 0, - 'Overall/L2 mAP': 0, - 'Overall/L2 mAPH': 0 - } - mAP_splits = ret_texts.split('mAP ') - mAPH_splits = ret_texts.split('mAPH ') - for idx, key in enumerate(ap_dict.keys()): - split_idx = int(idx / 2) + 1 - if idx % 2 == 0: # mAP - ap_dict[key] = float(mAP_splits[split_idx].split(']')[0]) - else: # mAPH - ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0]) - ap_dict['Overall/L1 mAP'] = \ - (ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] + - ap_dict['Cyclist/L1 mAP']) / 3 - ap_dict['Overall/L1 mAPH'] = \ - (ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] + - ap_dict['Cyclist/L1 mAPH']) / 3 - ap_dict['Overall/L2 mAP'] = \ - (ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] + - ap_dict['Cyclist/L2 mAP']) / 3 - ap_dict['Overall/L2 mAPH'] = \ - (ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] + - ap_dict['Cyclist/L2 mAPH']) / 3 - if eval_tmp_dir is not None: - eval_tmp_dir.cleanup() - - if tmp_dir is not None: - tmp_dir.cleanup() - - if show or out_dir: - self.show(results, out_dir, show=show, pipeline=pipeline) - return ap_dict - - def bbox2result_kitti(self, - net_outputs, - class_names, - pklfile_prefix=None, - submission_prefix=None): - """Convert results to kitti format for evaluation and test submission. 
- - Args: - net_outputs (List[np.ndarray]): list of array storing the - bbox and score - class_nanes (List[String]): A list of class names - pklfile_prefix (str): The prefix of pkl file. - submission_prefix (str): The prefix of submission file. - - Returns: - List[dict]: A list of dict have the kitti 3d format - """ - assert len(net_outputs) == len(self.data_infos), \ - 'invalid list length of network outputs' - if submission_prefix is not None: - mmcv.mkdir_or_exist(submission_prefix) - - det_annos = [] - print('\nConverting prediction to KITTI format') - for idx, pred_dicts in enumerate( - mmcv.track_iter_progress(net_outputs)): - annos = [] - info = self.data_infos[idx] - sample_idx = info['image']['image_idx'] - image_shape = info['image']['image_shape'][:2] - - box_dict = self.convert_valid_bboxes(pred_dicts, info) - if len(box_dict['bbox']) > 0: - box_2d_preds = box_dict['bbox'] - box_preds = box_dict['box3d_camera'] - scores = box_dict['scores'] - box_preds_lidar = box_dict['box3d_lidar'] - label_preds = box_dict['label_preds'] - - anno = { - 'name': [], - 'truncated': [], - 'occluded': [], - 'alpha': [], - 'bbox': [], - 'dimensions': [], - 'location': [], - 'rotation_y': [], - 'score': [] - } - - for box, box_lidar, bbox, score, label in zip( - box_preds, box_preds_lidar, box_2d_preds, scores, - label_preds): - bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) - bbox[:2] = np.maximum(bbox[:2], [0, 0]) - anno['name'].append(class_names[int(label)]) - anno['truncated'].append(0.0) - anno['occluded'].append(0) - anno['alpha'].append( - -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) - anno['bbox'].append(bbox) - anno['dimensions'].append(box[3:6]) - anno['location'].append(box[:3]) - anno['rotation_y'].append(box[6]) - anno['score'].append(score) - - anno = {k: np.stack(v) for k, v in anno.items()} - annos.append(anno) - - if submission_prefix is not None: - curr_file = f'{submission_prefix}/{sample_idx:07d}.txt' - with open(curr_file, 'w') as f: - bbox = anno['bbox'] - loc = anno['location'] - dims = anno['dimensions'] # lhw -> hwl - - for idx in range(len(bbox)): - print( - '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} ' - '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'. - format(anno['name'][idx], anno['alpha'][idx], - bbox[idx][0], bbox[idx][1], - bbox[idx][2], bbox[idx][3], - dims[idx][1], dims[idx][2], - dims[idx][0], loc[idx][0], loc[idx][1], - loc[idx][2], anno['rotation_y'][idx], - anno['score'][idx]), - file=f) - else: - annos.append({ - 'name': np.array([]), - 'truncated': np.array([]), - 'occluded': np.array([]), - 'alpha': np.array([]), - 'bbox': np.zeros([0, 4]), - 'dimensions': np.zeros([0, 3]), - 'location': np.zeros([0, 3]), - 'rotation_y': np.array([]), - 'score': np.array([]), - }) - annos[-1]['sample_idx'] = np.array( - [sample_idx] * len(annos[-1]['score']), dtype=np.int64) - - det_annos += annos - - if pklfile_prefix is not None: - if not pklfile_prefix.endswith(('.pkl', '.pickle')): - out = f'{pklfile_prefix}.pkl' - mmcv.dump(det_annos, out) - print(f'Result is saved to {out}.') - - return det_annos - - def convert_valid_bboxes(self, box_dict, info): - """Convert the boxes into valid format. - - Args: - box_dict (dict): Bounding boxes to be converted. - - - boxes_3d (:obj:``LiDARInstance3DBoxes``): 3D bounding boxes. - - scores_3d (np.ndarray): Scores of predicted boxes. - - labels_3d (np.ndarray): Class labels of predicted boxes. - info (dict): Dataset information dictionary. - - Returns: - dict: Valid boxes after conversion. 
- - - bbox (np.ndarray): 2D bounding boxes (in camera 0). - - box3d_camera (np.ndarray): 3D boxes in camera coordinates. - - box3d_lidar (np.ndarray): 3D boxes in lidar coordinates. - - scores (np.ndarray): Scores of predicted boxes. - - label_preds (np.ndarray): Class labels of predicted boxes. - - sample_idx (np.ndarray): Sample index. - """ - # TODO: refactor this function - box_preds = box_dict['boxes_3d'] - scores = box_dict['scores_3d'] - labels = box_dict['labels_3d'] - sample_idx = info['image']['image_idx'] - box_preds.limit_yaw(offset=0.5, period=np.pi * 2) - - if len(box_preds) == 0: - return dict( - bbox=np.zeros([0, 4]), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0, 4]), - sample_idx=sample_idx) - - rect = info['calib']['R0_rect'].astype(np.float32) - Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) - P0 = info['calib']['P0'].astype(np.float32) - P0 = box_preds.tensor.new_tensor(P0) - - box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c) - - box_corners = box_preds_camera.corners - box_corners_in_image = points_cam2img(box_corners, P0) - # box_corners_in_image: [N, 8, 2] - minxy = torch.min(box_corners_in_image, dim=1)[0] - maxxy = torch.max(box_corners_in_image, dim=1)[0] - box_2d_preds = torch.cat([minxy, maxxy], dim=1) - # Post-processing - # check box_preds - limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) - valid_pcd_inds = ((box_preds.center > limit_range[:3]) & - (box_preds.center < limit_range[3:])) - valid_inds = valid_pcd_inds.all(-1) - - if valid_inds.sum() > 0: - return dict( - bbox=box_2d_preds[valid_inds, :].numpy(), - box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), - box3d_lidar=box_preds[valid_inds].tensor.numpy(), - scores=scores[valid_inds].numpy(), - label_preds=labels[valid_inds].numpy(), - sample_idx=sample_idx, - ) - else: - return dict( - bbox=np.zeros([0, 4]), - box3d_camera=np.zeros([0, 7]), - box3d_lidar=np.zeros([0, 7]), - scores=np.zeros([0]), - label_preds=np.zeros([0, 4]), - sample_idx=sample_idx, - ) +# Copyright (c) OpenMMLab. All rights reserved. +import os +import tempfile +from os import path as osp + +import mmcv +import numpy as np +import torch +from mmcv.utils import print_log + +from ..core.bbox import Box3DMode, points_cam2img +from .builder import DATASETS +from .kitti_dataset import KittiDataset + + +@DATASETS.register_module() +class WaymoDataset(KittiDataset): + """Waymo Dataset. + + This class serves as the API for experiments on the Waymo Dataset. + + Please refer to ``_for data downloading. + It is recommended to symlink the dataset root to $MMDETECTION3D/data and + organize them as the doc shows. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + split (str): Split of input data. + pts_prefix (str, optional): Prefix of points files. + Defaults to 'velodyne'. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR' in this dataset. 
Available options includes + + - 'LiDAR': box in LiDAR coordinates + - 'Depth': box in depth coordinates, usually for indoor dataset + - 'Camera': box in camera coordinates + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + pcd_limit_range (list(float), optional): The range of point cloud used + to filter invalid predicted boxes. + Default: [-85, -85, -5, 85, 85, 5]. + """ + + CLASSES = ('Car', 'Cyclist', 'Pedestrian') + + def __init__(self, + data_root, + ann_file, + split, + pts_prefix='velodyne', + pipeline=None, + classes=None, + modality=None, + box_type_3d='LiDAR', + filter_empty_gt=True, + test_mode=False, + load_interval=1, + pcd_limit_range=[-85, -85, -5, 85, 85, 5], + **kwargs): + super().__init__( + data_root=data_root, + ann_file=ann_file, + split=split, + pts_prefix=pts_prefix, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + pcd_limit_range=pcd_limit_range, + **kwargs) + + # to load a subset, just set the load_interval in the dataset config + self.data_infos = self.data_infos[::load_interval] + if hasattr(self, 'flag'): + self.flag = self.flag[::load_interval] + + def _get_pts_filename(self, idx): + pts_filename = osp.join(self.root_split, self.pts_prefix, + f'{idx:07d}.bin') + return pts_filename + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Standard input_dict consists of the + data information. + + - sample_idx (str): sample index + - pts_filename (str): filename of point clouds + - img_prefix (str): prefix of image files + - img_info (dict): image info + - lidar2img (list[np.ndarray], optional): transformations from + lidar to different cameras + - ann_info (dict): annotation info + """ + info = self.data_infos[index] + sample_idx = info['image']['image_idx'] + img_filename = os.path.join(self.data_root, + info['image']['image_path']) + + # TODO: consider use torch.Tensor only + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P0 = info['calib']['P0'].astype(np.float32) + lidar2img = P0 @ rect @ Trv2c + + pts_filename = self._get_pts_filename(sample_idx) + input_dict = dict( + sample_idx=sample_idx, + pts_filename=pts_filename, + img_prefix=None, + img_info=dict(filename=img_filename), + lidar2img=lidar2img) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + return input_dict + + def format_results(self, + outputs, + pklfile_prefix=None, + submission_prefix=None, + data_format='waymo'): + """Format the results to pkl file. + + Args: + outputs (list[dict]): Testing results of the dataset. + pklfile_prefix (str): The prefix of pkl files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + submission_prefix (str): The prefix of submitted files. It + includes the file path and the prefix of filename, e.g., + "a/b/prefix". If not specified, a temp file will be created. + Default: None. + data_format (str, optional): Output data format. + Default: 'waymo'. Another supported choice is 'kitti'. 
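# ---- Illustrative sketch (editorial, not part of the original diff) ----
# `lidar2img = P0 @ rect @ Trv2c` in `get_data_info` above chains the
# rectification, the LiDAR-to-camera extrinsics and the camera-0 projection
# into a single matrix, so a homogeneous LiDAR point maps straight to pixel
# coordinates. Toy 4x4 matrices below (identity extrinsics and a made-up
# projection); with identity extrinsics the toy point is effectively already
# in the camera frame, i.e. its third coordinate is the depth:
import numpy as np

rect = np.eye(4, dtype=np.float32)
Trv2c = np.eye(4, dtype=np.float32)
P0 = np.array([[700., 0., 600., 0.],
               [0., 700., 180., 0.],
               [0., 0., 1., 0.],
               [0., 0., 0., 1.]], dtype=np.float32)
lidar2img = P0 @ rect @ Trv2c

pt = np.array([0.5, -0.2, 2.0, 1.0], dtype=np.float32)  # homogeneous point
proj = lidar2img @ pt
u, v = proj[:2] / proj[2]     # perspective divide by the depth term
print(u, v)                   # ~ (775.0, 110.0) for these toy values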
+ + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing + the json filepaths, tmp_dir is the temporal directory created + for saving json files when jsonfile_prefix is not specified. + """ + if pklfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + pklfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + assert ('waymo' in data_format or 'kitti' in data_format), \ + f'invalid data_format {data_format}' + + if (not isinstance(outputs[0], dict)) or 'img_bbox' in outputs[0]: + raise TypeError('Not supported type for reformat results.') + elif 'pts_bbox' in outputs[0]: + result_files = dict() + for name in outputs[0]: + results_ = [out[name] for out in outputs] + pklfile_prefix_ = pklfile_prefix + name + if submission_prefix is not None: + submission_prefix_ = f'{submission_prefix}_{name}' + else: + submission_prefix_ = None + result_files_ = self.bbox2result_kitti(results_, self.CLASSES, + pklfile_prefix_, + submission_prefix_) + result_files[name] = result_files_ + else: + result_files = self.bbox2result_kitti(outputs, self.CLASSES, + pklfile_prefix, + submission_prefix) + if 'waymo' in data_format: + from ..core.evaluation.waymo_utils.prediction_kitti_to_waymo import \ + KITTI2Waymo # noqa + waymo_root = osp.join( + self.data_root.split('kitti_format')[0], 'waymo_format') + if self.split == 'training': + waymo_tfrecords_dir = osp.join(waymo_root, 'validation') + prefix = '1' + elif self.split == 'testing': + waymo_tfrecords_dir = osp.join(waymo_root, 'testing') + prefix = '2' + else: + raise ValueError('Not supported split value.') + save_tmp_dir = tempfile.TemporaryDirectory() + waymo_results_save_dir = save_tmp_dir.name + waymo_results_final_path = f'{pklfile_prefix}.bin' + if 'pts_bbox' in result_files: + converter = KITTI2Waymo(result_files['pts_bbox'], + waymo_tfrecords_dir, + waymo_results_save_dir, + waymo_results_final_path, prefix) + else: + converter = KITTI2Waymo(result_files, waymo_tfrecords_dir, + waymo_results_save_dir, + waymo_results_final_path, prefix) + converter.convert() + save_tmp_dir.cleanup() + + return result_files, tmp_dir + + def evaluate(self, + results, + metric='waymo', + logger=None, + pklfile_prefix=None, + submission_prefix=None, + show=False, + out_dir=None, + pipeline=None): + """Evaluation in KITTI protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str], optional): Metrics to be evaluated. + Default: 'waymo'. Another supported metric is 'kitti'. + logger (logging.Logger | str, optional): Logger used for printing + related information during evaluation. Default: None. + pklfile_prefix (str, optional): The prefix of pkl files including + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + submission_prefix (str, optional): The prefix of submission data. + If not specified, the submission data will not be generated. + show (bool, optional): Whether to visualize. + Default: False. + out_dir (str, optional): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. 
+ + Returns: + dict[str: float]: results of each evaluation metric + """ + assert ('waymo' in metric or 'kitti' in metric), \ + f'invalid metric {metric}' + if 'kitti' in metric: + result_files, tmp_dir = self.format_results( + results, + pklfile_prefix, + submission_prefix, + data_format='kitti') + from mmdet3d.core.evaluation import kitti_eval + gt_annos = [info['annos'] for info in self.data_infos] + + if isinstance(result_files, dict): + ap_dict = dict() + for name, result_files_ in result_files.items(): + eval_types = ['bev', '3d'] + ap_result_str, ap_dict_ = kitti_eval( + gt_annos, + result_files_, + self.CLASSES, + eval_types=eval_types) + for ap_type, ap in ap_dict_.items(): + ap_dict[f'{name}/{ap_type}'] = float( + '{:.4f}'.format(ap)) + + print_log( + f'Results of {name}:\n' + ap_result_str, logger=logger) + + else: + ap_result_str, ap_dict = kitti_eval( + gt_annos, + result_files, + self.CLASSES, + eval_types=['bev', '3d']) + print_log('\n' + ap_result_str, logger=logger) + if 'waymo' in metric: + waymo_root = osp.join( + self.data_root.split('kitti_format')[0], 'waymo_format') + if pklfile_prefix is None: + eval_tmp_dir = tempfile.TemporaryDirectory() + pklfile_prefix = osp.join(eval_tmp_dir.name, 'results') + else: + eval_tmp_dir = None + result_files, tmp_dir = self.format_results( + results, + pklfile_prefix, + submission_prefix, + data_format='waymo') + import subprocess + ret_bytes = subprocess.check_output( + 'mmdet3d/core/evaluation/waymo_utils/' + + f'compute_detection_metrics_main {pklfile_prefix}.bin ' + + f'{waymo_root}/gt.bin', + shell=True) + ret_texts = ret_bytes.decode('utf-8') + print_log(ret_texts) + # parse the text to get ap_dict + ap_dict = { + 'Vehicle/L1 mAP': 0, + 'Vehicle/L1 mAPH': 0, + 'Vehicle/L2 mAP': 0, + 'Vehicle/L2 mAPH': 0, + 'Pedestrian/L1 mAP': 0, + 'Pedestrian/L1 mAPH': 0, + 'Pedestrian/L2 mAP': 0, + 'Pedestrian/L2 mAPH': 0, + 'Sign/L1 mAP': 0, + 'Sign/L1 mAPH': 0, + 'Sign/L2 mAP': 0, + 'Sign/L2 mAPH': 0, + 'Cyclist/L1 mAP': 0, + 'Cyclist/L1 mAPH': 0, + 'Cyclist/L2 mAP': 0, + 'Cyclist/L2 mAPH': 0, + 'Overall/L1 mAP': 0, + 'Overall/L1 mAPH': 0, + 'Overall/L2 mAP': 0, + 'Overall/L2 mAPH': 0 + } + mAP_splits = ret_texts.split('mAP ') + mAPH_splits = ret_texts.split('mAPH ') + for idx, key in enumerate(ap_dict.keys()): + split_idx = int(idx / 2) + 1 + if idx % 2 == 0: # mAP + ap_dict[key] = float(mAP_splits[split_idx].split(']')[0]) + else: # mAPH + ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0]) + ap_dict['Overall/L1 mAP'] = \ + (ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] + + ap_dict['Cyclist/L1 mAP']) / 3 + ap_dict['Overall/L1 mAPH'] = \ + (ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] + + ap_dict['Cyclist/L1 mAPH']) / 3 + ap_dict['Overall/L2 mAP'] = \ + (ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] + + ap_dict['Cyclist/L2 mAP']) / 3 + ap_dict['Overall/L2 mAPH'] = \ + (ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] + + ap_dict['Cyclist/L2 mAPH']) / 3 + if eval_tmp_dir is not None: + eval_tmp_dir.cleanup() + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show or out_dir: + self.show(results, out_dir, show=show, pipeline=pipeline) + return ap_dict + + def bbox2result_kitti(self, + net_outputs, + class_names, + pklfile_prefix=None, + submission_prefix=None): + """Convert results to kitti format for evaluation and test submission. 
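# The Waymo branch above shells out to the prebuilt compute_detection_metrics_main
# binary and scrapes its stdout. The string below is a fabricated two-line stand-in
# that follows the same '[mAP x] [mAPH y]' pattern, only to show how the split-based
# parsing recovers the numbers into ap_dict.
ret_texts = (
    'OBJECT_TYPE_TYPE_VEHICLE_LEVEL_1: [mAP 0.71] [mAPH 0.70]\n'
    'OBJECT_TYPE_TYPE_VEHICLE_LEVEL_2: [mAP 0.63] [mAPH 0.62]\n'
)
ap_dict = {'Vehicle/L1 mAP': 0, 'Vehicle/L1 mAPH': 0,
           'Vehicle/L2 mAP': 0, 'Vehicle/L2 mAPH': 0}
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
for idx, key in enumerate(ap_dict):
    split_idx = idx // 2 + 1
    if idx % 2 == 0:                               # mAP entries
        ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
    else:                                          # mAPH entries
        ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
print(ap_dict)   # {'Vehicle/L1 mAP': 0.71, ..., 'Vehicle/L2 mAPH': 0.62}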
+ + Args: + net_outputs (List[np.ndarray]): list of array storing the + bbox and score + class_nanes (List[String]): A list of class names + pklfile_prefix (str): The prefix of pkl file. + submission_prefix (str): The prefix of submission file. + + Returns: + List[dict]: A list of dict have the kitti 3d format + """ + assert len(net_outputs) == len(self.data_infos), \ + 'invalid list length of network outputs' + if submission_prefix is not None: + mmcv.mkdir_or_exist(submission_prefix) + + det_annos = [] + print('\nConverting prediction to KITTI format') + for idx, pred_dicts in enumerate( + mmcv.track_iter_progress(net_outputs)): + annos = [] + info = self.data_infos[idx] + sample_idx = info['image']['image_idx'] + image_shape = info['image']['image_shape'][:2] + + box_dict = self.convert_valid_bboxes(pred_dicts, info) + if len(box_dict['bbox']) > 0: + box_2d_preds = box_dict['bbox'] + box_preds = box_dict['box3d_camera'] + scores = box_dict['scores'] + box_preds_lidar = box_dict['box3d_lidar'] + label_preds = box_dict['label_preds'] + + anno = { + 'name': [], + 'truncated': [], + 'occluded': [], + 'alpha': [], + 'bbox': [], + 'dimensions': [], + 'location': [], + 'rotation_y': [], + 'score': [] + } + + for box, box_lidar, bbox, score, label in zip( + box_preds, box_preds_lidar, box_2d_preds, scores, + label_preds): + bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) + bbox[:2] = np.maximum(bbox[:2], [0, 0]) + anno['name'].append(class_names[int(label)]) + anno['truncated'].append(0.0) + anno['occluded'].append(0) + anno['alpha'].append( + -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) + anno['bbox'].append(bbox) + anno['dimensions'].append(box[3:6]) + anno['location'].append(box[:3]) + anno['rotation_y'].append(box[6]) + anno['score'].append(score) + + anno = {k: np.stack(v) for k, v in anno.items()} + annos.append(anno) + + if submission_prefix is not None: + curr_file = f'{submission_prefix}/{sample_idx:07d}.txt' + with open(curr_file, 'w') as f: + bbox = anno['bbox'] + loc = anno['location'] + dims = anno['dimensions'] # lhw -> hwl + + for idx in range(len(bbox)): + print( + '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} ' + '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'. + format(anno['name'][idx], anno['alpha'][idx], + bbox[idx][0], bbox[idx][1], + bbox[idx][2], bbox[idx][3], + dims[idx][1], dims[idx][2], + dims[idx][0], loc[idx][0], loc[idx][1], + loc[idx][2], anno['rotation_y'][idx], + anno['score'][idx]), + file=f) + else: + annos.append({ + 'name': np.array([]), + 'truncated': np.array([]), + 'occluded': np.array([]), + 'alpha': np.array([]), + 'bbox': np.zeros([0, 4]), + 'dimensions': np.zeros([0, 3]), + 'location': np.zeros([0, 3]), + 'rotation_y': np.array([]), + 'score': np.array([]), + }) + annos[-1]['sample_idx'] = np.array( + [sample_idx] * len(annos[-1]['score']), dtype=np.int64) + + det_annos += annos + + if pklfile_prefix is not None: + if not pklfile_prefix.endswith(('.pkl', '.pickle')): + out = f'{pklfile_prefix}.pkl' + mmcv.dump(det_annos, out) + print(f'Result is saved to {out}.') + + return det_annos + + def convert_valid_bboxes(self, box_dict, info): + """Convert the boxes into valid format. + + Args: + box_dict (dict): Bounding boxes to be converted. + + - boxes_3d (:obj:``LiDARInstance3DBoxes``): 3D bounding boxes. + - scores_3d (np.ndarray): Scores of predicted boxes. + - labels_3d (np.ndarray): Class labels of predicted boxes. + info (dict): Dataset information dictionary. + + Returns: + dict: Valid boxes after conversion. 
+ + - bbox (np.ndarray): 2D bounding boxes (in camera 0). + - box3d_camera (np.ndarray): 3D boxes in camera coordinates. + - box3d_lidar (np.ndarray): 3D boxes in lidar coordinates. + - scores (np.ndarray): Scores of predicted boxes. + - label_preds (np.ndarray): Class labels of predicted boxes. + - sample_idx (np.ndarray): Sample index. + """ + # TODO: refactor this function + box_preds = box_dict['boxes_3d'] + scores = box_dict['scores_3d'] + labels = box_dict['labels_3d'] + sample_idx = info['image']['image_idx'] + box_preds.limit_yaw(offset=0.5, period=np.pi * 2) + + if len(box_preds) == 0: + return dict( + bbox=np.zeros([0, 4]), + box3d_camera=np.zeros([0, 7]), + box3d_lidar=np.zeros([0, 7]), + scores=np.zeros([0]), + label_preds=np.zeros([0, 4]), + sample_idx=sample_idx) + + rect = info['calib']['R0_rect'].astype(np.float32) + Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) + P0 = info['calib']['P0'].astype(np.float32) + P0 = box_preds.tensor.new_tensor(P0) + + box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c) + + box_corners = box_preds_camera.corners + box_corners_in_image = points_cam2img(box_corners, P0) + # box_corners_in_image: [N, 8, 2] + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + box_2d_preds = torch.cat([minxy, maxxy], dim=1) + # Post-processing + # check box_preds + limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) + valid_pcd_inds = ((box_preds.center > limit_range[:3]) & + (box_preds.center < limit_range[3:])) + valid_inds = valid_pcd_inds.all(-1) + + if valid_inds.sum() > 0: + return dict( + bbox=box_2d_preds[valid_inds, :].numpy(), + box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), + box3d_lidar=box_preds[valid_inds].tensor.numpy(), + scores=scores[valid_inds].numpy(), + label_preds=labels[valid_inds].numpy(), + sample_idx=sample_idx, + ) + else: + return dict( + bbox=np.zeros([0, 4]), + box3d_camera=np.zeros([0, 7]), + box3d_lidar=np.zeros([0, 7]), + scores=np.zeros([0]), + label_preds=np.zeros([0, 4]), + sample_idx=sample_idx, + ) diff --git a/mmdet3d/models/__init__.py b/mmdet3d/models/__init__.py index 7c7e8fc..a6fb648 100644 --- a/mmdet3d/models/__init__.py +++ b/mmdet3d/models/__init__.py @@ -1,29 +1,29 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
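# A numpy-only sketch of what convert_valid_bboxes() and bbox2result_kitti() do with
# box corners: project the 8 corners of a camera-frame box through a 3x4 camera
# matrix, take the per-box min/max to form a 2D box, then clamp it to the image.
# The intrinsics and corner coordinates are made up for illustration.
import numpy as np

P = np.array([[700., 0., 620., 0.],
              [0., 700., 360., 0.],
              [0., 0., 1., 0.]], dtype=np.float32)    # fabricated 3x4 projection
image_shape = np.array([720, 1280])                    # (height, width)

# One box: 8 corners in camera coordinates (x right, y down, z forward), homogeneous.
corners = np.array([[x, y, z, 1.0]
                    for x in (-1.0, 1.0)
                    for y in (-0.8, 0.8)
                    for z in (9.0, 11.0)], dtype=np.float32)   # (8, 4)

uvw = corners @ P.T                                    # (8, 3)
uv = uvw[:, :2] / uvw[:, 2:3]                          # perspective divide
bbox = np.concatenate([uv.min(0), uv.max(0)])          # [x1, y1, x2, y2]

# Clamp exactly as in bbox2result_kitti(): image_shape is (h, w), hence the [::-1].
bbox[2:] = np.minimum(bbox[2:], image_shape[::-1])
bbox[:2] = np.maximum(bbox[:2], [0, 0])
print(bbox)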
-from .backbones import * # noqa: F401,F403 -from .builder import (BACKBONES, DETECTORS, FUSION_LAYERS, HEADS, LOSSES, - MIDDLE_ENCODERS, NECKS, ROI_EXTRACTORS, SEGMENTORS, - SHARED_HEADS, VOXEL_ENCODERS, build_backbone, - build_detector, build_fusion_layer, build_head, - build_loss, build_middle_encoder, build_model, - build_neck, build_roi_extractor, build_shared_head, - build_voxel_encoder) -from .decode_heads import * # noqa: F401,F403 -from .dense_heads import * # noqa: F401,F403 -from .detectors import * # noqa: F401,F403 -from .fusion_layers import * # noqa: F401,F403 -from .losses import * # noqa: F401,F403 -from .middle_encoders import * # noqa: F401,F403 -from .model_utils import * # noqa: F401,F403 -from .necks import * # noqa: F401,F403 -from .roi_heads import * # noqa: F401,F403 -from .segmentors import * # noqa: F401,F403 -from .voxel_encoders import * # noqa: F401,F403 - -__all__ = [ - 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES', - 'DETECTORS', 'SEGMENTORS', 'VOXEL_ENCODERS', 'MIDDLE_ENCODERS', - 'FUSION_LAYERS', 'build_backbone', 'build_neck', 'build_roi_extractor', - 'build_shared_head', 'build_head', 'build_loss', 'build_detector', - 'build_fusion_layer', 'build_model', 'build_middle_encoder', - 'build_voxel_encoder' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .backbones import * # noqa: F401,F403 +from .builder import (BACKBONES, DETECTORS, FUSION_LAYERS, HEADS, LOSSES, + MIDDLE_ENCODERS, NECKS, ROI_EXTRACTORS, SEGMENTORS, + SHARED_HEADS, VOXEL_ENCODERS, build_backbone, + build_detector, build_fusion_layer, build_head, + build_loss, build_middle_encoder, build_model, + build_neck, build_roi_extractor, build_shared_head, + build_voxel_encoder) +from .decode_heads import * # noqa: F401,F403 +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .fusion_layers import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .middle_encoders import * # noqa: F401,F403 +from .model_utils import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .roi_heads import * # noqa: F401,F403 +from .segmentors import * # noqa: F401,F403 +from .voxel_encoders import * # noqa: F401,F403 + +__all__ = [ + 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES', + 'DETECTORS', 'SEGMENTORS', 'VOXEL_ENCODERS', 'MIDDLE_ENCODERS', + 'FUSION_LAYERS', 'build_backbone', 'build_neck', 'build_roi_extractor', + 'build_shared_head', 'build_head', 'build_loss', 'build_detector', + 'build_fusion_layer', 'build_model', 'build_middle_encoder', + 'build_voxel_encoder' +] diff --git a/mmdet3d/models/backbones/__init__.py b/mmdet3d/models/backbones/__init__.py index d51c16d..6b6a0b4 100644 --- a/mmdet3d/models/backbones/__init__.py +++ b/mmdet3d/models/backbones/__init__.py @@ -1,16 +1,16 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.models.backbones import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt -from .dgcnn import DGCNNBackbone -from .dla import DLANet -from .mink_resnet import MinkResNet -from .multi_backbone import MultiBackbone -from .nostem_regnet import NoStemRegNet -from .pointnet2_sa_msg import PointNet2SAMSG -from .pointnet2_sa_ssg import PointNet2SASSG -from .second import SECOND - -__all__ = [ - 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet', - 'SECOND', 'DGCNNBackbone', 'PointNet2SASSG', 'PointNet2SAMSG', - 'MultiBackbone', 'DLANet', 'MinkResNet' -] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.models.backbones import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt +from .dgcnn import DGCNNBackbone +from .dla import DLANet +from .mink_resnet import MinkResNet +from .multi_backbone import MultiBackbone +from .nostem_regnet import NoStemRegNet +from .pointnet2_sa_msg import PointNet2SAMSG +from .pointnet2_sa_ssg import PointNet2SASSG +from .second import SECOND + +__all__ = [ + 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet', + 'SECOND', 'DGCNNBackbone', 'PointNet2SASSG', 'PointNet2SAMSG', + 'MultiBackbone', 'DLANet', 'MinkResNet' +] diff --git a/mmdet3d/models/backbones/base_pointnet.py b/mmdet3d/models/backbones/base_pointnet.py index 31439e6..68abff7 100644 --- a/mmdet3d/models/backbones/base_pointnet.py +++ b/mmdet3d/models/backbones/base_pointnet.py @@ -1,39 +1,39 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings -from abc import ABCMeta - -from mmcv.runner import BaseModule - - -class BasePointNet(BaseModule, metaclass=ABCMeta): - """Base class for PointNet.""" - - def __init__(self, init_cfg=None, pretrained=None): - super(BasePointNet, self).__init__(init_cfg) - self.fp16_enabled = False - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - warnings.warn('DeprecationWarning: pretrained is a deprecated, ' - 'please use "init_cfg" instead') - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - - @staticmethod - def _split_point_feats(points): - """Split coordinates and features of input points. - - Args: - points (torch.Tensor): Point coordinates with features, - with shape (B, N, 3 + input_feature_dim). - - Returns: - torch.Tensor: Coordinates of input points. - torch.Tensor: Features of input points. - """ - xyz = points[..., 0:3].contiguous() - if points.size(-1) > 3: - features = points[..., 3:].transpose(1, 2).contiguous() - else: - features = None - - return xyz, features +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from abc import ABCMeta + +from mmcv.runner import BaseModule + + +class BasePointNet(BaseModule, metaclass=ABCMeta): + """Base class for PointNet.""" + + def __init__(self, init_cfg=None, pretrained=None): + super(BasePointNet, self).__init__(init_cfg) + self.fp16_enabled = False + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + + @staticmethod + def _split_point_feats(points): + """Split coordinates and features of input points. + + Args: + points (torch.Tensor): Point coordinates with features, + with shape (B, N, 3 + input_feature_dim). + + Returns: + torch.Tensor: Coordinates of input points. + torch.Tensor: Features of input points. + """ + xyz = points[..., 0:3].contiguous() + if points.size(-1) > 3: + features = points[..., 3:].transpose(1, 2).contiguous() + else: + features = None + + return xyz, features diff --git a/mmdet3d/models/backbones/dgcnn.py b/mmdet3d/models/backbones/dgcnn.py index 20e82d9..b61f843 100644 --- a/mmdet3d/models/backbones/dgcnn.py +++ b/mmdet3d/models/backbones/dgcnn.py @@ -1,98 +1,98 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
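# A tiny runnable check (torch only) of what BasePointNet._split_point_feats()
# returns for a toy tensor: xyz stays (B, N, 3), the remaining channels become a
# (B, C, N) feature tensor ready for Conv1d-style layers.
import torch

points = torch.rand(2, 1024, 3 + 4)                        # (B, N, 3 + input_feature_dim)
xyz = points[..., 0:3].contiguous()                        # (B, N, 3) coordinates
features = points[..., 3:].transpose(1, 2).contiguous()    # (B, C, N) features
print(xyz.shape, features.shape)   # torch.Size([2, 1024, 3]) torch.Size([2, 4, 1024])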
-from mmcv.runner import BaseModule, auto_fp16 -from torch import nn as nn - -from mmdet3d.ops import DGCNNFAModule, DGCNNGFModule -from ..builder import BACKBONES - - -@BACKBONES.register_module() -class DGCNNBackbone(BaseModule): - """Backbone network for DGCNN. - - Args: - in_channels (int): Input channels of point cloud. - num_samples (tuple[int], optional): The number of samples for knn or - ball query in each graph feature (GF) module. - Defaults to (20, 20, 20). - knn_modes (tuple[str], optional): Mode of KNN of each knn module. - Defaults to ('D-KNN', 'F-KNN', 'F-KNN'). - radius (tuple[float], optional): Sampling radii of each GF module. - Defaults to (None, None, None). - gf_channels (tuple[tuple[int]], optional): Out channels of each mlp in - GF module. Defaults to ((64, 64), (64, 64), (64, )). - fa_channels (tuple[int], optional): Out channels of each mlp in FA - module. Defaults to (1024, ). - act_cfg (dict, optional): Config of activation layer. - Defaults to dict(type='ReLU'). - init_cfg (dict, optional): Initialization config. - Defaults to None. - """ - - def __init__(self, - in_channels, - num_samples=(20, 20, 20), - knn_modes=('D-KNN', 'F-KNN', 'F-KNN'), - radius=(None, None, None), - gf_channels=((64, 64), (64, 64), (64, )), - fa_channels=(1024, ), - act_cfg=dict(type='ReLU'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.num_gf = len(gf_channels) - - assert len(num_samples) == len(knn_modes) == len(radius) == len( - gf_channels), 'Num_samples, knn_modes, radius and gf_channels \ - should have the same length.' - - self.GF_modules = nn.ModuleList() - gf_in_channel = in_channels * 2 - skip_channel_list = [gf_in_channel] # input channel list - - for gf_index in range(self.num_gf): - cur_gf_mlps = list(gf_channels[gf_index]) - cur_gf_mlps = [gf_in_channel] + cur_gf_mlps - gf_out_channel = cur_gf_mlps[-1] - - self.GF_modules.append( - DGCNNGFModule( - mlp_channels=cur_gf_mlps, - num_sample=num_samples[gf_index], - knn_mode=knn_modes[gf_index], - radius=radius[gf_index], - act_cfg=act_cfg)) - skip_channel_list.append(gf_out_channel) - gf_in_channel = gf_out_channel * 2 - - fa_in_channel = sum(skip_channel_list[1:]) - cur_fa_mlps = list(fa_channels) - cur_fa_mlps = [fa_in_channel] + cur_fa_mlps - - self.FA_module = DGCNNFAModule( - mlp_channels=cur_fa_mlps, act_cfg=act_cfg) - - @auto_fp16(apply_to=('points', )) - def forward(self, points): - """Forward pass. - - Args: - points (torch.Tensor): point coordinates with features, - with shape (B, N, in_channels). - - Returns: - dict[str, list[torch.Tensor]]: Outputs after graph feature (GF) and - feature aggregation (FA) modules. - - - gf_points (list[torch.Tensor]): Outputs after each GF module. - - fa_points (torch.Tensor): Outputs after FA module. - """ - gf_points = [points] - - for i in range(self.num_gf): - cur_points = self.GF_modules[i](gf_points[i]) - gf_points.append(cur_points) - - fa_points = self.FA_module(gf_points) - - out = dict(gf_points=gf_points, fa_points=fa_points) - return out +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import BaseModule, auto_fp16 +from torch import nn as nn + +from mmdet3d.ops import DGCNNFAModule, DGCNNGFModule +from ..builder import BACKBONES + + +@BACKBONES.register_module() +class DGCNNBackbone(BaseModule): + """Backbone network for DGCNN. + + Args: + in_channels (int): Input channels of point cloud. + num_samples (tuple[int], optional): The number of samples for knn or + ball query in each graph feature (GF) module. + Defaults to (20, 20, 20). 
+ knn_modes (tuple[str], optional): Mode of KNN of each knn module. + Defaults to ('D-KNN', 'F-KNN', 'F-KNN'). + radius (tuple[float], optional): Sampling radii of each GF module. + Defaults to (None, None, None). + gf_channels (tuple[tuple[int]], optional): Out channels of each mlp in + GF module. Defaults to ((64, 64), (64, 64), (64, )). + fa_channels (tuple[int], optional): Out channels of each mlp in FA + module. Defaults to (1024, ). + act_cfg (dict, optional): Config of activation layer. + Defaults to dict(type='ReLU'). + init_cfg (dict, optional): Initialization config. + Defaults to None. + """ + + def __init__(self, + in_channels, + num_samples=(20, 20, 20), + knn_modes=('D-KNN', 'F-KNN', 'F-KNN'), + radius=(None, None, None), + gf_channels=((64, 64), (64, 64), (64, )), + fa_channels=(1024, ), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_gf = len(gf_channels) + + assert len(num_samples) == len(knn_modes) == len(radius) == len( + gf_channels), 'Num_samples, knn_modes, radius and gf_channels \ + should have the same length.' + + self.GF_modules = nn.ModuleList() + gf_in_channel = in_channels * 2 + skip_channel_list = [gf_in_channel] # input channel list + + for gf_index in range(self.num_gf): + cur_gf_mlps = list(gf_channels[gf_index]) + cur_gf_mlps = [gf_in_channel] + cur_gf_mlps + gf_out_channel = cur_gf_mlps[-1] + + self.GF_modules.append( + DGCNNGFModule( + mlp_channels=cur_gf_mlps, + num_sample=num_samples[gf_index], + knn_mode=knn_modes[gf_index], + radius=radius[gf_index], + act_cfg=act_cfg)) + skip_channel_list.append(gf_out_channel) + gf_in_channel = gf_out_channel * 2 + + fa_in_channel = sum(skip_channel_list[1:]) + cur_fa_mlps = list(fa_channels) + cur_fa_mlps = [fa_in_channel] + cur_fa_mlps + + self.FA_module = DGCNNFAModule( + mlp_channels=cur_fa_mlps, act_cfg=act_cfg) + + @auto_fp16(apply_to=('points', )) + def forward(self, points): + """Forward pass. + + Args: + points (torch.Tensor): point coordinates with features, + with shape (B, N, in_channels). + + Returns: + dict[str, list[torch.Tensor]]: Outputs after graph feature (GF) and + feature aggregation (FA) modules. + + - gf_points (list[torch.Tensor]): Outputs after each GF module. + - fa_points (torch.Tensor): Outputs after FA module. + """ + gf_points = [points] + + for i in range(self.num_gf): + cur_points = self.GF_modules[i](gf_points[i]) + gf_points.append(cur_points) + + fa_points = self.FA_module(gf_points) + + out = dict(gf_points=gf_points, fa_points=fa_points) + return out diff --git a/mmdet3d/models/backbones/dla.py b/mmdet3d/models/backbones/dla.py index a547909..0be6d88 100644 --- a/mmdet3d/models/backbones/dla.py +++ b/mmdet3d/models/backbones/dla.py @@ -1,446 +1,446 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -import torch -from mmcv.cnn import build_conv_layer, build_norm_layer -from mmcv.runner import BaseModule -from torch import nn - -from ..builder import BACKBONES - - -def dla_build_norm_layer(cfg, num_features): - """Build normalization layer specially designed for DLANet. - - Args: - cfg (dict): The norm layer config, which should contain: - - - type (str): Layer type. - - layer args: Args needed to instantiate a norm layer. - - requires_grad (bool, optional): Whether stop gradient updates. - num_features (int): Number of input channels. - - - Returns: - Function: Build normalization layer in mmcv. 
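# The channel bookkeeping from DGCNNBackbone.__init__() as plain Python, using the
# default arguments shown above. in_channels=3 is an assumption (an xyz-only point
# cloud); edge features pair each point with its neighbors, hence the "* 2" before
# every graph-feature (GF) module.
in_channels = 3
gf_channels = ((64, 64), (64, 64), (64,))
fa_channels = (1024,)

gf_in = in_channels * 2                 # 6: [point, neighbor - point] pairs
skip_channels = [gf_in]
for mlps in gf_channels:
    gf_out = mlps[-1]
    skip_channels.append(gf_out)
    gf_in = gf_out * 2                  # the next GF module again sees paired features
fa_in = sum(skip_channels[1:])          # concat of all GF outputs: 64 + 64 + 64 = 192
print(skip_channels, fa_in, fa_channels[-1])   # [6, 64, 64, 64] 192 1024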
- """ - cfg_ = cfg.copy() - if cfg_['type'] == 'GN': - if num_features % 32 == 0: - return build_norm_layer(cfg_, num_features) - else: - assert 'num_groups' in cfg_ - cfg_['num_groups'] = cfg_['num_groups'] // 2 - return build_norm_layer(cfg_, num_features) - else: - return build_norm_layer(cfg_, num_features) - - -class BasicBlock(BaseModule): - """BasicBlock in DLANet. - - Args: - in_channels (int): Input feature channel. - out_channels (int): Output feature channel. - norm_cfg (dict): Dictionary to construct and config - norm layer. - conv_cfg (dict): Dictionary to construct and config - conv layer. - stride (int, optional): Conv stride. Default: 1. - dilation (int, optional): Conv dilation. Default: 1. - init_cfg (dict, optional): Initialization config. - Default: None. - """ - - def __init__(self, - in_channels, - out_channels, - norm_cfg, - conv_cfg, - stride=1, - dilation=1, - init_cfg=None): - super(BasicBlock, self).__init__(init_cfg) - self.conv1 = build_conv_layer( - conv_cfg, - in_channels, - out_channels, - 3, - stride=stride, - padding=dilation, - dilation=dilation, - bias=False) - self.norm1 = dla_build_norm_layer(norm_cfg, out_channels)[1] - self.relu = nn.ReLU(inplace=True) - self.conv2 = build_conv_layer( - conv_cfg, - out_channels, - out_channels, - 3, - stride=1, - padding=dilation, - dilation=dilation, - bias=False) - self.norm2 = dla_build_norm_layer(norm_cfg, out_channels)[1] - self.stride = stride - - def forward(self, x, identity=None): - """Forward function.""" - - if identity is None: - identity = x - out = self.conv1(x) - out = self.norm1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.norm2(out) - out += identity - out = self.relu(out) - - return out - - -class Root(BaseModule): - """Root in DLANet. - - Args: - in_channels (int): Input feature channel. - out_channels (int): Output feature channel. - norm_cfg (dict): Dictionary to construct and config - norm layer. - conv_cfg (dict): Dictionary to construct and config - conv layer. - kernel_size (int): Size of convolution kernel. - add_identity (bool): Whether to add identity in root. - init_cfg (dict, optional): Initialization config. - Default: None. - """ - - def __init__(self, - in_channels, - out_channels, - norm_cfg, - conv_cfg, - kernel_size, - add_identity, - init_cfg=None): - super(Root, self).__init__(init_cfg) - self.conv = build_conv_layer( - conv_cfg, - in_channels, - out_channels, - 1, - stride=1, - padding=(kernel_size - 1) // 2, - bias=False) - self.norm = dla_build_norm_layer(norm_cfg, out_channels)[1] - self.relu = nn.ReLU(inplace=True) - self.add_identity = add_identity - - def forward(self, feat_list): - """Forward function. - - Args: - feat_list (list[torch.Tensor]): Output features from - multiple layers. - """ - children = feat_list - x = self.conv(torch.cat(feat_list, 1)) - x = self.norm(x) - if self.add_identity: - x += children[0] - x = self.relu(x) - - return x - - -class Tree(BaseModule): - """Tree in DLANet. - - Args: - levels (int): The level of the tree. - block (nn.Module): The block module in tree. - in_channels: Input feature channel. - out_channels: Output feature channel. - norm_cfg (dict): Dictionary to construct and config - norm layer. - conv_cfg (dict): Dictionary to construct and config - conv layer. - stride (int, optional): Convolution stride. - Default: 1. - level_root (bool, optional): whether belongs to the - root layer. - root_dim (int, optional): Root input feature channel. - root_kernel_size (int, optional): Size of root - convolution kernel. 
Default: 1. - dilation (int, optional): Conv dilation. Default: 1. - add_identity (bool, optional): Whether to add - identity in root. Default: False. - init_cfg (dict, optional): Initialization config. - Default: None. - """ - - def __init__(self, - levels, - block, - in_channels, - out_channels, - norm_cfg, - conv_cfg, - stride=1, - level_root=False, - root_dim=None, - root_kernel_size=1, - dilation=1, - add_identity=False, - init_cfg=None): - super(Tree, self).__init__(init_cfg) - if root_dim is None: - root_dim = 2 * out_channels - if level_root: - root_dim += in_channels - if levels == 1: - self.root = Root(root_dim, out_channels, norm_cfg, conv_cfg, - root_kernel_size, add_identity) - self.tree1 = block( - in_channels, - out_channels, - norm_cfg, - conv_cfg, - stride, - dilation=dilation) - self.tree2 = block( - out_channels, - out_channels, - norm_cfg, - conv_cfg, - 1, - dilation=dilation) - else: - self.tree1 = Tree( - levels - 1, - block, - in_channels, - out_channels, - norm_cfg, - conv_cfg, - stride, - root_dim=None, - root_kernel_size=root_kernel_size, - dilation=dilation, - add_identity=add_identity) - self.tree2 = Tree( - levels - 1, - block, - out_channels, - out_channels, - norm_cfg, - conv_cfg, - root_dim=root_dim + out_channels, - root_kernel_size=root_kernel_size, - dilation=dilation, - add_identity=add_identity) - self.level_root = level_root - self.root_dim = root_dim - self.downsample = None - self.project = None - self.levels = levels - if stride > 1: - self.downsample = nn.MaxPool2d(stride, stride=stride) - if in_channels != out_channels: - self.project = nn.Sequential( - build_conv_layer( - conv_cfg, - in_channels, - out_channels, - 1, - stride=1, - bias=False), - dla_build_norm_layer(norm_cfg, out_channels)[1]) - - def forward(self, x, identity=None, children=None): - children = [] if children is None else children - bottom = self.downsample(x) if self.downsample else x - identity = self.project(bottom) if self.project else bottom - if self.level_root: - children.append(bottom) - x1 = self.tree1(x, identity) - if self.levels == 1: - x2 = self.tree2(x1) - feat_list = [x2, x1] + children - x = self.root(feat_list) - else: - children.append(x1) - x = self.tree2(x1, children=children) - return x - - -@BACKBONES.register_module() -class DLANet(BaseModule): - r"""`DLA backbone `_. - - Args: - depth (int): Depth of DLA. Default: 34. - in_channels (int, optional): Number of input image channels. - Default: 3. - norm_cfg (dict, optional): Dictionary to construct and config - norm layer. Default: None. - conv_cfg (dict, optional): Dictionary to construct and config - conv layer. Default: None. - layer_with_level_root (list[bool], optional): Whether to apply - level_root in each DLA layer, this is only used for - tree levels. Default: (False, True, True, True). - with_identity_root (bool, optional): Whether to add identity - in root layer. Default: False. - pretrained (str, optional): model pretrained path. - Default: None. - init_cfg (dict or list[dict], optional): Initialization - config dict. 
Default: None - """ - arch_settings = { - 34: (BasicBlock, (1, 1, 1, 2, 2, 1), (16, 32, 64, 128, 256, 512)), - } - - def __init__(self, - depth, - in_channels=3, - out_indices=(0, 1, 2, 3, 4, 5), - frozen_stages=-1, - norm_cfg=None, - conv_cfg=None, - layer_with_level_root=(False, True, True, True), - with_identity_root=False, - pretrained=None, - init_cfg=None): - super(DLANet, self).__init__(init_cfg) - if depth not in self.arch_settings: - raise KeyError(f'invalida depth {depth} for DLA') - - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - warnings.warn('DeprecationWarning: pretrained is a deprecated, ' - 'please use "init_cfg" instead') - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - elif pretrained is None: - if init_cfg is None: - self.init_cfg = [ - dict(type='Kaiming', layer='Conv2d'), - dict( - type='Constant', - val=1, - layer=['_BatchNorm', 'GroupNorm']) - ] - - block, levels, channels = self.arch_settings[depth] - self.channels = channels - self.num_levels = len(levels) - self.frozen_stages = frozen_stages - self.out_indices = out_indices - assert max(out_indices) < self.num_levels - self.base_layer = nn.Sequential( - build_conv_layer( - conv_cfg, - in_channels, - channels[0], - 7, - stride=1, - padding=3, - bias=False), - dla_build_norm_layer(norm_cfg, channels[0])[1], - nn.ReLU(inplace=True)) - - # DLANet first uses two conv layers then uses several - # Tree layers - for i in range(2): - level_layer = self._make_conv_level( - channels[0], - channels[i], - levels[i], - norm_cfg, - conv_cfg, - stride=i + 1) - layer_name = f'level{i}' - self.add_module(layer_name, level_layer) - - for i in range(2, self.num_levels): - dla_layer = Tree( - levels[i], - block, - channels[i - 1], - channels[i], - norm_cfg, - conv_cfg, - 2, - level_root=layer_with_level_root[i - 2], - add_identity=with_identity_root) - layer_name = f'level{i}' - self.add_module(layer_name, dla_layer) - - self._freeze_stages() - - def _make_conv_level(self, - in_channels, - out_channels, - num_convs, - norm_cfg, - conv_cfg, - stride=1, - dilation=1): - """Conv modules. - - Args: - in_channels (int): Input feature channel. - out_channels (int): Output feature channel. - num_convs (int): Number of Conv module. - norm_cfg (dict): Dictionary to construct and config - norm layer. - conv_cfg (dict): Dictionary to construct and config - conv layer. - stride (int, optional): Conv stride. Default: 1. - dilation (int, optional): Conv dilation. Default: 1. 
- """ - modules = [] - for i in range(num_convs): - modules.extend([ - build_conv_layer( - conv_cfg, - in_channels, - out_channels, - 3, - stride=stride if i == 0 else 1, - padding=dilation, - bias=False, - dilation=dilation), - dla_build_norm_layer(norm_cfg, out_channels)[1], - nn.ReLU(inplace=True) - ]) - in_channels = out_channels - return nn.Sequential(*modules) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - self.base_layer.eval() - for param in self.base_layer.parameters(): - param.requires_grad = False - - for i in range(2): - m = getattr(self, f'level{i}') - m.eval() - for param in m.parameters(): - param.requires_grad = False - - for i in range(1, self.frozen_stages + 1): - m = getattr(self, f'level{i+1}') - m.eval() - for param in m.parameters(): - param.requires_grad = False - - def forward(self, x): - outs = [] - x = self.base_layer(x) - for i in range(self.num_levels): - x = getattr(self, 'level{}'.format(i))(x) - if i in self.out_indices: - outs.append(x) - return tuple(outs) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner import BaseModule +from torch import nn + +from ..builder import BACKBONES + + +def dla_build_norm_layer(cfg, num_features): + """Build normalization layer specially designed for DLANet. + + Args: + cfg (dict): The norm layer config, which should contain: + + - type (str): Layer type. + - layer args: Args needed to instantiate a norm layer. + - requires_grad (bool, optional): Whether stop gradient updates. + num_features (int): Number of input channels. + + + Returns: + Function: Build normalization layer in mmcv. + """ + cfg_ = cfg.copy() + if cfg_['type'] == 'GN': + if num_features % 32 == 0: + return build_norm_layer(cfg_, num_features) + else: + assert 'num_groups' in cfg_ + cfg_['num_groups'] = cfg_['num_groups'] // 2 + return build_norm_layer(cfg_, num_features) + else: + return build_norm_layer(cfg_, num_features) + + +class BasicBlock(BaseModule): + """BasicBlock in DLANet. + + Args: + in_channels (int): Input feature channel. + out_channels (int): Output feature channel. + norm_cfg (dict): Dictionary to construct and config + norm layer. + conv_cfg (dict): Dictionary to construct and config + conv layer. + stride (int, optional): Conv stride. Default: 1. + dilation (int, optional): Conv dilation. Default: 1. + init_cfg (dict, optional): Initialization config. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + norm_cfg, + conv_cfg, + stride=1, + dilation=1, + init_cfg=None): + super(BasicBlock, self).__init__(init_cfg) + self.conv1 = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.norm1 = dla_build_norm_layer(norm_cfg, out_channels)[1] + self.relu = nn.ReLU(inplace=True) + self.conv2 = build_conv_layer( + conv_cfg, + out_channels, + out_channels, + 3, + stride=1, + padding=dilation, + dilation=dilation, + bias=False) + self.norm2 = dla_build_norm_layer(norm_cfg, out_channels)[1] + self.stride = stride + + def forward(self, x, identity=None): + """Forward function.""" + + if identity is None: + identity = x + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.norm2(out) + out += identity + out = self.relu(out) + + return out + + +class Root(BaseModule): + """Root in DLANet. + + Args: + in_channels (int): Input feature channel. 
+ out_channels (int): Output feature channel. + norm_cfg (dict): Dictionary to construct and config + norm layer. + conv_cfg (dict): Dictionary to construct and config + conv layer. + kernel_size (int): Size of convolution kernel. + add_identity (bool): Whether to add identity in root. + init_cfg (dict, optional): Initialization config. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + norm_cfg, + conv_cfg, + kernel_size, + add_identity, + init_cfg=None): + super(Root, self).__init__(init_cfg) + self.conv = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + 1, + stride=1, + padding=(kernel_size - 1) // 2, + bias=False) + self.norm = dla_build_norm_layer(norm_cfg, out_channels)[1] + self.relu = nn.ReLU(inplace=True) + self.add_identity = add_identity + + def forward(self, feat_list): + """Forward function. + + Args: + feat_list (list[torch.Tensor]): Output features from + multiple layers. + """ + children = feat_list + x = self.conv(torch.cat(feat_list, 1)) + x = self.norm(x) + if self.add_identity: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(BaseModule): + """Tree in DLANet. + + Args: + levels (int): The level of the tree. + block (nn.Module): The block module in tree. + in_channels: Input feature channel. + out_channels: Output feature channel. + norm_cfg (dict): Dictionary to construct and config + norm layer. + conv_cfg (dict): Dictionary to construct and config + conv layer. + stride (int, optional): Convolution stride. + Default: 1. + level_root (bool, optional): whether belongs to the + root layer. + root_dim (int, optional): Root input feature channel. + root_kernel_size (int, optional): Size of root + convolution kernel. Default: 1. + dilation (int, optional): Conv dilation. Default: 1. + add_identity (bool, optional): Whether to add + identity in root. Default: False. + init_cfg (dict, optional): Initialization config. + Default: None. 
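# The root_dim bookkeeping from Tree.__init__() as a standalone helper, to make the
# concatenation width fed into Root explicit. The channel values below are
# illustrative only.
def root_input_width(in_channels, out_channels, level_root, root_dim=None):
    if root_dim is None:
        root_dim = 2 * out_channels      # tree1 output + tree2 output
    if level_root:
        root_dim += in_channels          # plus the (downsampled) input feature map
    return root_dim

print(root_input_width(64, 128, level_root=False))   # 256
print(root_input_width(64, 128, level_root=True))    # 320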
+ """ + + def __init__(self, + levels, + block, + in_channels, + out_channels, + norm_cfg, + conv_cfg, + stride=1, + level_root=False, + root_dim=None, + root_kernel_size=1, + dilation=1, + add_identity=False, + init_cfg=None): + super(Tree, self).__init__(init_cfg) + if root_dim is None: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.root = Root(root_dim, out_channels, norm_cfg, conv_cfg, + root_kernel_size, add_identity) + self.tree1 = block( + in_channels, + out_channels, + norm_cfg, + conv_cfg, + stride, + dilation=dilation) + self.tree2 = block( + out_channels, + out_channels, + norm_cfg, + conv_cfg, + 1, + dilation=dilation) + else: + self.tree1 = Tree( + levels - 1, + block, + in_channels, + out_channels, + norm_cfg, + conv_cfg, + stride, + root_dim=None, + root_kernel_size=root_kernel_size, + dilation=dilation, + add_identity=add_identity) + self.tree2 = Tree( + levels - 1, + block, + out_channels, + out_channels, + norm_cfg, + conv_cfg, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, + add_identity=add_identity) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + 1, + stride=1, + bias=False), + dla_build_norm_layer(norm_cfg, out_channels)[1]) + + def forward(self, x, identity=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + identity = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, identity) + if self.levels == 1: + x2 = self.tree2(x1) + feat_list = [x2, x1] + children + x = self.root(feat_list) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +@BACKBONES.register_module() +class DLANet(BaseModule): + r"""`DLA backbone `_. + + Args: + depth (int): Depth of DLA. Default: 34. + in_channels (int, optional): Number of input image channels. + Default: 3. + norm_cfg (dict, optional): Dictionary to construct and config + norm layer. Default: None. + conv_cfg (dict, optional): Dictionary to construct and config + conv layer. Default: None. + layer_with_level_root (list[bool], optional): Whether to apply + level_root in each DLA layer, this is only used for + tree levels. Default: (False, True, True, True). + with_identity_root (bool, optional): Whether to add identity + in root layer. Default: False. + pretrained (str, optional): model pretrained path. + Default: None. + init_cfg (dict or list[dict], optional): Initialization + config dict. 
Default: None + """ + arch_settings = { + 34: (BasicBlock, (1, 1, 1, 2, 2, 1), (16, 32, 64, 128, 256, 512)), + } + + def __init__(self, + depth, + in_channels=3, + out_indices=(0, 1, 2, 3, 4, 5), + frozen_stages=-1, + norm_cfg=None, + conv_cfg=None, + layer_with_level_root=(False, True, True, True), + with_identity_root=False, + pretrained=None, + init_cfg=None): + super(DLANet, self).__init__(init_cfg) + if depth not in self.arch_settings: + raise KeyError(f'invalida depth {depth} for DLA') + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + + block, levels, channels = self.arch_settings[depth] + self.channels = channels + self.num_levels = len(levels) + self.frozen_stages = frozen_stages + self.out_indices = out_indices + assert max(out_indices) < self.num_levels + self.base_layer = nn.Sequential( + build_conv_layer( + conv_cfg, + in_channels, + channels[0], + 7, + stride=1, + padding=3, + bias=False), + dla_build_norm_layer(norm_cfg, channels[0])[1], + nn.ReLU(inplace=True)) + + # DLANet first uses two conv layers then uses several + # Tree layers + for i in range(2): + level_layer = self._make_conv_level( + channels[0], + channels[i], + levels[i], + norm_cfg, + conv_cfg, + stride=i + 1) + layer_name = f'level{i}' + self.add_module(layer_name, level_layer) + + for i in range(2, self.num_levels): + dla_layer = Tree( + levels[i], + block, + channels[i - 1], + channels[i], + norm_cfg, + conv_cfg, + 2, + level_root=layer_with_level_root[i - 2], + add_identity=with_identity_root) + layer_name = f'level{i}' + self.add_module(layer_name, dla_layer) + + self._freeze_stages() + + def _make_conv_level(self, + in_channels, + out_channels, + num_convs, + norm_cfg, + conv_cfg, + stride=1, + dilation=1): + """Conv modules. + + Args: + in_channels (int): Input feature channel. + out_channels (int): Output feature channel. + num_convs (int): Number of Conv module. + norm_cfg (dict): Dictionary to construct and config + norm layer. + conv_cfg (dict): Dictionary to construct and config + conv layer. + stride (int, optional): Conv stride. Default: 1. + dilation (int, optional): Conv dilation. Default: 1. 
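# A usage sketch for the DLANet backbone defined above. It assumes mmdet3d (and its
# mmcv/mmdet dependencies) is installed; the input size, norm_cfg and the expected
# output shapes are illustrative rather than taken from any shipped config.
import torch
from mmdet3d.models import build_backbone

dla = build_backbone(
    dict(type='DLANet', depth=34, in_channels=3, norm_cfg=dict(type='BN')))
dla.eval()
with torch.no_grad():
    outs = dla(torch.rand(1, 3, 64, 64))
for feat in outs:
    print(tuple(feat.shape))
# With the default out_indices=(0, ..., 5) this should yield six maps with channels
# (16, 32, 64, 128, 256, 512) at strides 1, 2, 4, 8, 16, 32, i.e. roughly
# (1, 16, 64, 64) down to (1, 512, 2, 2).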
+ """ + modules = [] + for i in range(num_convs): + modules.extend([ + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + 3, + stride=stride if i == 0 else 1, + padding=dilation, + bias=False, + dilation=dilation), + dla_build_norm_layer(norm_cfg, out_channels)[1], + nn.ReLU(inplace=True) + ]) + in_channels = out_channels + return nn.Sequential(*modules) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.base_layer.eval() + for param in self.base_layer.parameters(): + param.requires_grad = False + + for i in range(2): + m = getattr(self, f'level{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'level{i+1}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def forward(self, x): + outs = [] + x = self.base_layer(x) + for i in range(self.num_levels): + x = getattr(self, 'level{}'.format(i))(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmdet3d/models/backbones/mink_resnet.py b/mmdet3d/models/backbones/mink_resnet.py index 35a79ce..e5d39f9 100644 --- a/mmdet3d/models/backbones/mink_resnet.py +++ b/mmdet3d/models/backbones/mink_resnet.py @@ -1,116 +1,116 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# Follow https://github.com/NVIDIA/MinkowskiEngine/blob/master/examples/resnet.py # noqa -# and mmcv.cnn.ResNet -try: - import MinkowskiEngine as ME - from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck -except ImportError: - import warnings - warnings.warn( - 'Please follow `getting_started.md` to install MinkowskiEngine.`') - # blocks are used in the static part of MinkResNet - BasicBlock, Bottleneck = None, None - -import torch.nn as nn - -from mmdet3d.models.builder import BACKBONES - - -@BACKBONES.register_module() -class MinkResNet(nn.Module): - r"""Minkowski ResNet backbone. See `4D Spatio-Temporal ConvNets - `_ for more details. - - Args: - depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. - in_channels (ont): Number of input channels, 3 for RGB. - num_stages (int, optional): Resnet stages. Default: 4. - pool (bool, optional): Add max pooling after first conv if True. - Default: True. - """ - arch_settings = { - 18: (BasicBlock, (2, 2, 2, 2)), - 34: (BasicBlock, (3, 4, 6, 3)), - 50: (Bottleneck, (3, 4, 6, 3)), - 101: (Bottleneck, (3, 4, 23, 3)), - 152: (Bottleneck, (3, 8, 36, 3)) - } - - def __init__(self, depth, in_channels, num_stages=4, pool=True): - super(MinkResNet, self).__init__() - if depth not in self.arch_settings: - raise KeyError(f'invalid depth {depth} for resnet') - assert 4 >= num_stages >= 1 - block, stage_blocks = self.arch_settings[depth] - stage_blocks = stage_blocks[:num_stages] - self.num_stages = num_stages - self.pool = pool - - self.inplanes = 64 - self.conv1 = ME.MinkowskiConvolution( - in_channels, self.inplanes, kernel_size=3, stride=2, dimension=3) - # May be BatchNorm is better, but we follow original implementation. 
- self.norm1 = ME.MinkowskiInstanceNorm(self.inplanes) - self.relu = ME.MinkowskiReLU(inplace=True) - if self.pool: - self.maxpool = ME.MinkowskiMaxPooling( - kernel_size=2, stride=2, dimension=3) - - for i, num_blocks in enumerate(stage_blocks): - setattr( - self, f'layer{i}', - self._make_layer(block, 64 * 2**i, stage_blocks[i], stride=2)) - - def init_weights(self): - for m in self.modules(): - if isinstance(m, ME.MinkowskiConvolution): - ME.utils.kaiming_normal_( - m.kernel, mode='fan_out', nonlinearity='relu') - - if isinstance(m, ME.MinkowskiBatchNorm): - nn.init.constant_(m.bn.weight, 1) - nn.init.constant_(m.bn.bias, 0) - - def _make_layer(self, block, planes, blocks, stride): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - ME.MinkowskiConvolution( - self.inplanes, - planes * block.expansion, - kernel_size=1, - stride=stride, - dimension=3), - ME.MinkowskiBatchNorm(planes * block.expansion)) - layers = [] - layers.append( - block( - self.inplanes, - planes, - stride=stride, - downsample=downsample, - dimension=3)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(self.inplanes, planes, stride=1, dimension=3)) - return nn.Sequential(*layers) - - def forward(self, x): - """Forward pass of ResNet. - - Args: - x (ME.SparseTensor): Input sparse tensor. - - Returns: - list[ME.SparseTensor]: Output sparse tensors. - """ - x = self.conv1(x) - x = self.norm1(x) - x = self.relu(x) - if self.pool: - x = self.maxpool(x) - outs = [] - for i in range(self.num_stages): - x = getattr(self, f'layer{i}')(x) - outs.append(x) - return outs +# Copyright (c) OpenMMLab. All rights reserved. +# Follow https://github.com/NVIDIA/MinkowskiEngine/blob/master/examples/resnet.py # noqa +# and mmcv.cnn.ResNet +try: + import MinkowskiEngine as ME + from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck +except ImportError: + import warnings + warnings.warn( + 'Please follow `getting_started.md` to install MinkowskiEngine.`') + # blocks are used in the static part of MinkResNet + BasicBlock, Bottleneck = None, None + +import torch.nn as nn + +from mmdet3d.models.builder import BACKBONES + + +@BACKBONES.register_module() +class MinkResNet(nn.Module): + r"""Minkowski ResNet backbone. See `4D Spatio-Temporal ConvNets + `_ for more details. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (ont): Number of input channels, 3 for RGB. + num_stages (int, optional): Resnet stages. Default: 4. + pool (bool, optional): Add max pooling after first conv if True. + Default: True. + """ + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, depth, in_channels, num_stages=4, pool=True): + super(MinkResNet, self).__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + assert 4 >= num_stages >= 1 + block, stage_blocks = self.arch_settings[depth] + stage_blocks = stage_blocks[:num_stages] + self.num_stages = num_stages + self.pool = pool + + self.inplanes = 64 + self.conv1 = ME.MinkowskiConvolution( + in_channels, self.inplanes, kernel_size=3, stride=2, dimension=3) + # May be BatchNorm is better, but we follow original implementation. 
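# The per-stage output widths implied by MinkResNet._make_layer() above, computed
# without MinkowskiEngine. expansion is 1 for BasicBlock (depth 18/34) and 4 for
# Bottleneck (depth 50/101/152), matching the torchvision-style blocks that
# MinkowskiEngine's resnet_block module provides.
def stage_channels(num_stages, expansion):
    return [64 * 2 ** i * expansion for i in range(num_stages)]

print(stage_channels(4, expansion=1))   # depth 18/34 -> [64, 128, 256, 512]
print(stage_channels(4, expansion=4))   # depth 50+   -> [256, 512, 1024, 2048]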
+ self.norm1 = ME.MinkowskiInstanceNorm(self.inplanes) + self.relu = ME.MinkowskiReLU(inplace=True) + if self.pool: + self.maxpool = ME.MinkowskiMaxPooling( + kernel_size=2, stride=2, dimension=3) + + for i, num_blocks in enumerate(stage_blocks): + setattr( + self, f'layer{i}', + self._make_layer(block, 64 * 2**i, stage_blocks[i], stride=2)) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, ME.MinkowskiConvolution): + ME.utils.kaiming_normal_( + m.kernel, mode='fan_out', nonlinearity='relu') + + if isinstance(m, ME.MinkowskiBatchNorm): + nn.init.constant_(m.bn.weight, 1) + nn.init.constant_(m.bn.bias, 0) + + def _make_layer(self, block, planes, blocks, stride): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + ME.MinkowskiConvolution( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + dimension=3), + ME.MinkowskiBatchNorm(planes * block.expansion)) + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride=stride, + downsample=downsample, + dimension=3)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, stride=1, dimension=3)) + return nn.Sequential(*layers) + + def forward(self, x): + """Forward pass of ResNet. + + Args: + x (ME.SparseTensor): Input sparse tensor. + + Returns: + list[ME.SparseTensor]: Output sparse tensors. + """ + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + if self.pool: + x = self.maxpool(x) + outs = [] + for i in range(self.num_stages): + x = getattr(self, f'layer{i}')(x) + outs.append(x) + return outs diff --git a/mmdet3d/models/backbones/multi_backbone.py b/mmdet3d/models/backbones/multi_backbone.py index ed04ecd..8f80610 100644 --- a/mmdet3d/models/backbones/multi_backbone.py +++ b/mmdet3d/models/backbones/multi_backbone.py @@ -1,127 +1,127 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import warnings - -import torch -from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule, auto_fp16 -from torch import nn as nn - -from ..builder import BACKBONES, build_backbone - - -@BACKBONES.register_module() -class MultiBackbone(BaseModule): - """MultiBackbone with different configs. - - Args: - num_streams (int): The number of backbones. - backbones (list or dict): A list of backbone configs. - aggregation_mlp_channels (list[int]): Specify the mlp layers - for feature aggregation. - conv_cfg (dict): Config dict of convolutional layers. - norm_cfg (dict): Config dict of normalization layers. - act_cfg (dict): Config dict of activation layers. - suffixes (list): A list of suffixes to rename the return dict - for each backbone. - """ - - def __init__(self, - num_streams, - backbones, - aggregation_mlp_channels=None, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), - act_cfg=dict(type='ReLU'), - suffixes=('net0', 'net1'), - init_cfg=None, - pretrained=None, - **kwargs): - super().__init__(init_cfg=init_cfg) - assert isinstance(backbones, dict) or isinstance(backbones, list) - if isinstance(backbones, dict): - backbones_list = [] - for ind in range(num_streams): - backbones_list.append(copy.deepcopy(backbones)) - backbones = backbones_list - - assert len(backbones) == num_streams - assert len(suffixes) == num_streams - - self.backbone_list = nn.ModuleList() - # Rename the ret_dict with different suffixs. 
- self.suffixes = suffixes - - out_channels = 0 - - for backbone_cfg in backbones: - out_channels += backbone_cfg['fp_channels'][-1][-1] - self.backbone_list.append(build_backbone(backbone_cfg)) - - # Feature aggregation layers - if aggregation_mlp_channels is None: - aggregation_mlp_channels = [ - out_channels, out_channels // 2, - out_channels // len(self.backbone_list) - ] - else: - aggregation_mlp_channels.insert(0, out_channels) - - self.aggregation_layers = nn.Sequential() - for i in range(len(aggregation_mlp_channels) - 1): - self.aggregation_layers.add_module( - f'layer{i}', - ConvModule( - aggregation_mlp_channels[i], - aggregation_mlp_channels[i + 1], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - bias=True, - inplace=True)) - - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - warnings.warn('DeprecationWarning: pretrained is a deprecated, ' - 'please use "init_cfg" instead') - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - - @auto_fp16() - def forward(self, points): - """Forward pass. - - Args: - points (torch.Tensor): point coordinates with features, - with shape (B, N, 3 + input_feature_dim). - - Returns: - dict[str, list[torch.Tensor]]: Outputs from multiple backbones. - - - fp_xyz[suffix] (list[torch.Tensor]): The coordinates of - each fp features. - - fp_features[suffix] (list[torch.Tensor]): The features - from each Feature Propagate Layers. - - fp_indices[suffix] (list[torch.Tensor]): Indices of the - input points. - - hd_feature (torch.Tensor): The aggregation feature - from multiple backbones. - """ - ret = {} - fp_features = [] - for ind in range(len(self.backbone_list)): - cur_ret = self.backbone_list[ind](points) - cur_suffix = self.suffixes[ind] - fp_features.append(cur_ret['fp_features'][-1]) - if cur_suffix != '': - for k in cur_ret.keys(): - cur_ret[k + '_' + cur_suffix] = cur_ret.pop(k) - ret.update(cur_ret) - - # Combine the features here - hd_feature = torch.cat(fp_features, dim=1) - hd_feature = self.aggregation_layers(hd_feature) - ret['hd_feature'] = hd_feature - return ret +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings + +import torch +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule, auto_fp16 +from torch import nn as nn + +from ..builder import BACKBONES, build_backbone + + +@BACKBONES.register_module() +class MultiBackbone(BaseModule): + """MultiBackbone with different configs. + + Args: + num_streams (int): The number of backbones. + backbones (list or dict): A list of backbone configs. + aggregation_mlp_channels (list[int]): Specify the mlp layers + for feature aggregation. + conv_cfg (dict): Config dict of convolutional layers. + norm_cfg (dict): Config dict of normalization layers. + act_cfg (dict): Config dict of activation layers. + suffixes (list): A list of suffixes to rename the return dict + for each backbone. 
+ """ + + def __init__(self, + num_streams, + backbones, + aggregation_mlp_channels=None, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), + act_cfg=dict(type='ReLU'), + suffixes=('net0', 'net1'), + init_cfg=None, + pretrained=None, + **kwargs): + super().__init__(init_cfg=init_cfg) + assert isinstance(backbones, dict) or isinstance(backbones, list) + if isinstance(backbones, dict): + backbones_list = [] + for ind in range(num_streams): + backbones_list.append(copy.deepcopy(backbones)) + backbones = backbones_list + + assert len(backbones) == num_streams + assert len(suffixes) == num_streams + + self.backbone_list = nn.ModuleList() + # Rename the ret_dict with different suffixs. + self.suffixes = suffixes + + out_channels = 0 + + for backbone_cfg in backbones: + out_channels += backbone_cfg['fp_channels'][-1][-1] + self.backbone_list.append(build_backbone(backbone_cfg)) + + # Feature aggregation layers + if aggregation_mlp_channels is None: + aggregation_mlp_channels = [ + out_channels, out_channels // 2, + out_channels // len(self.backbone_list) + ] + else: + aggregation_mlp_channels.insert(0, out_channels) + + self.aggregation_layers = nn.Sequential() + for i in range(len(aggregation_mlp_channels) - 1): + self.aggregation_layers.add_module( + f'layer{i}', + ConvModule( + aggregation_mlp_channels[i], + aggregation_mlp_channels[i + 1], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=True, + inplace=True)) + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + + @auto_fp16() + def forward(self, points): + """Forward pass. + + Args: + points (torch.Tensor): point coordinates with features, + with shape (B, N, 3 + input_feature_dim). + + Returns: + dict[str, list[torch.Tensor]]: Outputs from multiple backbones. + + - fp_xyz[suffix] (list[torch.Tensor]): The coordinates of + each fp features. + - fp_features[suffix] (list[torch.Tensor]): The features + from each Feature Propagate Layers. + - fp_indices[suffix] (list[torch.Tensor]): Indices of the + input points. + - hd_feature (torch.Tensor): The aggregation feature + from multiple backbones. + """ + ret = {} + fp_features = [] + for ind in range(len(self.backbone_list)): + cur_ret = self.backbone_list[ind](points) + cur_suffix = self.suffixes[ind] + fp_features.append(cur_ret['fp_features'][-1]) + if cur_suffix != '': + for k in cur_ret.keys(): + cur_ret[k + '_' + cur_suffix] = cur_ret.pop(k) + ret.update(cur_ret) + + # Combine the features here + hd_feature = torch.cat(fp_features, dim=1) + hd_feature = self.aggregation_layers(hd_feature) + ret['hd_feature'] = hd_feature + return ret diff --git a/mmdet3d/models/backbones/nostem_regnet.py b/mmdet3d/models/backbones/nostem_regnet.py index 3090508..c39a7ad 100644 --- a/mmdet3d/models/backbones/nostem_regnet.py +++ b/mmdet3d/models/backbones/nostem_regnet.py @@ -1,84 +1,84 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.models.backbones import RegNet -from ..builder import BACKBONES - - -@BACKBONES.register_module() -class NoStemRegNet(RegNet): - """RegNet backbone without Stem for 3D detection. - - More details can be found in `paper `_ . - - Args: - arch (dict): The parameter of RegNets. - - w0 (int): Initial width. - - wa (float): Slope of width. 
- - wm (float): Quantization parameter to quantize the width. - - depth (int): Depth of the backbone. - - group_w (int): Width of group. - - bot_mul (float): Bottleneck ratio, i.e. expansion of bottleneck. - strides (Sequence[int]): Strides of the first block of each stage. - base_channels (int): Base channels after stem layer. - in_channels (int): Number of input image channels. Normally 3. - dilations (Sequence[int]): Dilation of each stage. - out_indices (Sequence[int]): Output from which stages. - style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two - layer is the 3x3 conv layer, otherwise the stride-two layer is - the first 1x1 conv layer. - frozen_stages (int): Stages to be frozen (all param fixed). -1 means - not freezing any parameters. - norm_cfg (dict): Dictionary to construct and config norm layer. - norm_eval (bool): Whether to set norm layers to eval mode, namely, - freeze running stats (mean and var). Note: Effect on Batch Norm - and its variants only. - with_cp (bool): Use checkpoint or not. Using checkpoint will save some - memory while slowing down the training speed. - zero_init_residual (bool): Whether to use zero init for last norm layer - in resblocks to let them behave as identity. - - Example: - >>> from mmdet3d.models import NoStemRegNet - >>> import torch - >>> self = NoStemRegNet( - arch=dict( - w0=88, - wa=26.31, - wm=2.25, - group_w=48, - depth=25, - bot_mul=1.0)) - >>> self.eval() - >>> inputs = torch.rand(1, 64, 16, 16) - >>> level_outputs = self.forward(inputs) - >>> for level_out in level_outputs: - ... print(tuple(level_out.shape)) - (1, 96, 8, 8) - (1, 192, 4, 4) - (1, 432, 2, 2) - (1, 1008, 1, 1) - """ - - def __init__(self, arch, init_cfg=None, **kwargs): - super(NoStemRegNet, self).__init__(arch, init_cfg=init_cfg, **kwargs) - - def _make_stem_layer(self, in_channels, base_channels): - """Override the original function that do not initialize a stem layer - since 3D detector's voxel encoder works like a stem layer.""" - return - - def forward(self, x): - """Forward function of backbone. - - Args: - x (torch.Tensor): Features in shape (N, C, H, W). - - Returns: - tuple[torch.Tensor]: Multi-scale features. - """ - outs = [] - for i, layer_name in enumerate(self.res_layers): - res_layer = getattr(self, layer_name) - x = res_layer(x) - if i in self.out_indices: - outs.append(x) - return tuple(outs) +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.models.backbones import RegNet +from ..builder import BACKBONES + + +@BACKBONES.register_module() +class NoStemRegNet(RegNet): + """RegNet backbone without Stem for 3D detection. + + More details can be found in `paper `_ . + + Args: + arch (dict): The parameter of RegNets. + - w0 (int): Initial width. + - wa (float): Slope of width. + - wm (float): Quantization parameter to quantize the width. + - depth (int): Depth of the backbone. + - group_w (int): Width of group. + - bot_mul (float): Bottleneck ratio, i.e. expansion of bottleneck. + strides (Sequence[int]): Strides of the first block of each stage. + base_channels (int): Base channels after stem layer. + in_channels (int): Number of input image channels. Normally 3. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). 
-1 means + not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from mmdet3d.models import NoStemRegNet + >>> import torch + >>> self = NoStemRegNet( + arch=dict( + w0=88, + wa=26.31, + wm=2.25, + group_w=48, + depth=25, + bot_mul=1.0)) + >>> self.eval() + >>> inputs = torch.rand(1, 64, 16, 16) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 96, 8, 8) + (1, 192, 4, 4) + (1, 432, 2, 2) + (1, 1008, 1, 1) + """ + + def __init__(self, arch, init_cfg=None, **kwargs): + super(NoStemRegNet, self).__init__(arch, init_cfg=init_cfg, **kwargs) + + def _make_stem_layer(self, in_channels, base_channels): + """Override the original function that do not initialize a stem layer + since 3D detector's voxel encoder works like a stem layer.""" + return + + def forward(self, x): + """Forward function of backbone. + + Args: + x (torch.Tensor): Features in shape (N, C, H, W). + + Returns: + tuple[torch.Tensor]: Multi-scale features. + """ + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) diff --git a/mmdet3d/models/backbones/pointnet2_sa_msg.py b/mmdet3d/models/backbones/pointnet2_sa_msg.py index f6b1e47..ed4ce6c 100644 --- a/mmdet3d/models/backbones/pointnet2_sa_msg.py +++ b/mmdet3d/models/backbones/pointnet2_sa_msg.py @@ -1,175 +1,175 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import ConvModule -from mmcv.runner import auto_fp16 -from torch import nn as nn - -from mmdet3d.ops import build_sa_module -from ..builder import BACKBONES -from .base_pointnet import BasePointNet - - -@BACKBONES.register_module() -class PointNet2SAMSG(BasePointNet): - """PointNet2 with Multi-scale grouping. - - Args: - in_channels (int): Input channels of point cloud. - num_points (tuple[int]): The number of points which each SA - module samples. - radii (tuple[float]): Sampling radii of each SA module. - num_samples (tuple[int]): The number of samples for ball - query in each SA module. - sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module. - aggregation_channels (tuple[int]): Out channels of aggregation - multi-scale grouping features. - fps_mods (tuple[int]): Mod of FPS for each SA module. - fps_sample_range_lists (tuple[tuple[int]]): The number of sampling - points which each SA module samples. - dilated_group (tuple[bool]): Whether to use dilated ball query for - out_indices (Sequence[int]): Output from which stages. - norm_cfg (dict): Config of normalization layer. - sa_cfg (dict): Config of set abstraction module, which may contain - the following keys and values: - - - pool_mod (str): Pool method ('max' or 'avg') for SA modules. - - use_xyz (bool): Whether to use xyz as a part of features. - - normalize_xyz (bool): Whether to normalize xyz with radii in - each SA module. 
- """ - - def __init__(self, - in_channels, - num_points=(2048, 1024, 512, 256), - radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), - num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), - sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), - ((64, 64, 128), (64, 64, 128), (64, 96, 128)), - ((128, 128, 256), (128, 192, 256), (128, 256, - 256))), - aggregation_channels=(64, 128, 256), - fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), - fps_sample_range_lists=((-1), (-1), (512, -1)), - dilated_group=(True, True, True), - out_indices=(2, ), - norm_cfg=dict(type='BN2d'), - sa_cfg=dict( - type='PointSAModuleMSG', - pool_mod='max', - use_xyz=True, - normalize_xyz=False), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.num_sa = len(sa_channels) - self.out_indices = out_indices - assert max(out_indices) < self.num_sa - assert len(num_points) == len(radii) == len(num_samples) == len( - sa_channels) - if aggregation_channels is not None: - assert len(sa_channels) == len(aggregation_channels) - else: - aggregation_channels = [None] * len(sa_channels) - - self.SA_modules = nn.ModuleList() - self.aggregation_mlps = nn.ModuleList() - sa_in_channel = in_channels - 3 # number of channels without xyz - skip_channel_list = [sa_in_channel] - - for sa_index in range(self.num_sa): - cur_sa_mlps = list(sa_channels[sa_index]) - sa_out_channel = 0 - for radius_index in range(len(radii[sa_index])): - cur_sa_mlps[radius_index] = [sa_in_channel] + list( - cur_sa_mlps[radius_index]) - sa_out_channel += cur_sa_mlps[radius_index][-1] - - if isinstance(fps_mods[sa_index], tuple): - cur_fps_mod = list(fps_mods[sa_index]) - else: - cur_fps_mod = list([fps_mods[sa_index]]) - - if isinstance(fps_sample_range_lists[sa_index], tuple): - cur_fps_sample_range_list = list( - fps_sample_range_lists[sa_index]) - else: - cur_fps_sample_range_list = list( - [fps_sample_range_lists[sa_index]]) - - self.SA_modules.append( - build_sa_module( - num_point=num_points[sa_index], - radii=radii[sa_index], - sample_nums=num_samples[sa_index], - mlp_channels=cur_sa_mlps, - fps_mod=cur_fps_mod, - fps_sample_range_list=cur_fps_sample_range_list, - dilated_group=dilated_group[sa_index], - norm_cfg=norm_cfg, - cfg=sa_cfg, - bias=True)) - skip_channel_list.append(sa_out_channel) - - cur_aggregation_channel = aggregation_channels[sa_index] - if cur_aggregation_channel is None: - self.aggregation_mlps.append(None) - sa_in_channel = sa_out_channel - else: - self.aggregation_mlps.append( - ConvModule( - sa_out_channel, - cur_aggregation_channel, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - kernel_size=1, - bias=True)) - sa_in_channel = cur_aggregation_channel - - @auto_fp16(apply_to=('points', )) - def forward(self, points): - """Forward pass. - - Args: - points (torch.Tensor): point coordinates with features, - with shape (B, N, 3 + input_feature_dim). - - Returns: - dict[str, torch.Tensor]: Outputs of the last SA module. - - - sa_xyz (torch.Tensor): The coordinates of sa features. - - sa_features (torch.Tensor): The features from the - last Set Aggregation Layers. - - sa_indices (torch.Tensor): Indices of the - input points. 
- """ - xyz, features = self._split_point_feats(points) - - batch, num_points = xyz.shape[:2] - indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat( - batch, 1).long() - - sa_xyz = [xyz] - sa_features = [features] - sa_indices = [indices] - - out_sa_xyz = [xyz] - out_sa_features = [features] - out_sa_indices = [indices] - - for i in range(self.num_sa): - cur_xyz, cur_features, cur_indices = self.SA_modules[i]( - sa_xyz[i], sa_features[i]) - if self.aggregation_mlps[i] is not None: - cur_features = self.aggregation_mlps[i](cur_features) - sa_xyz.append(cur_xyz) - sa_features.append(cur_features) - sa_indices.append( - torch.gather(sa_indices[-1], 1, cur_indices.long())) - if i in self.out_indices: - out_sa_xyz.append(sa_xyz[-1]) - out_sa_features.append(sa_features[-1]) - out_sa_indices.append(sa_indices[-1]) - - return dict( - sa_xyz=out_sa_xyz, - sa_features=out_sa_features, - sa_indices=out_sa_indices) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from mmcv.runner import auto_fp16 +from torch import nn as nn + +from mmdet3d.ops import build_sa_module +from ..builder import BACKBONES +from .base_pointnet import BasePointNet + + +@BACKBONES.register_module() +class PointNet2SAMSG(BasePointNet): + """PointNet2 with Multi-scale grouping. + + Args: + in_channels (int): Input channels of point cloud. + num_points (tuple[int]): The number of points which each SA + module samples. + radii (tuple[float]): Sampling radii of each SA module. + num_samples (tuple[int]): The number of samples for ball + query in each SA module. + sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module. + aggregation_channels (tuple[int]): Out channels of aggregation + multi-scale grouping features. + fps_mods (tuple[int]): Mod of FPS for each SA module. + fps_sample_range_lists (tuple[tuple[int]]): The number of sampling + points which each SA module samples. + dilated_group (tuple[bool]): Whether to use dilated ball query for + out_indices (Sequence[int]): Output from which stages. + norm_cfg (dict): Config of normalization layer. + sa_cfg (dict): Config of set abstraction module, which may contain + the following keys and values: + + - pool_mod (str): Pool method ('max' or 'avg') for SA modules. + - use_xyz (bool): Whether to use xyz as a part of features. + - normalize_xyz (bool): Whether to normalize xyz with radii in + each SA module. 
+ """ + + def __init__(self, + in_channels, + num_points=(2048, 1024, 512, 256), + radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), + num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), + sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), + ((64, 64, 128), (64, 64, 128), (64, 96, 128)), + ((128, 128, 256), (128, 192, 256), (128, 256, + 256))), + aggregation_channels=(64, 128, 256), + fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), + fps_sample_range_lists=((-1), (-1), (512, -1)), + dilated_group=(True, True, True), + out_indices=(2, ), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_sa = len(sa_channels) + self.out_indices = out_indices + assert max(out_indices) < self.num_sa + assert len(num_points) == len(radii) == len(num_samples) == len( + sa_channels) + if aggregation_channels is not None: + assert len(sa_channels) == len(aggregation_channels) + else: + aggregation_channels = [None] * len(sa_channels) + + self.SA_modules = nn.ModuleList() + self.aggregation_mlps = nn.ModuleList() + sa_in_channel = in_channels - 3 # number of channels without xyz + skip_channel_list = [sa_in_channel] + + for sa_index in range(self.num_sa): + cur_sa_mlps = list(sa_channels[sa_index]) + sa_out_channel = 0 + for radius_index in range(len(radii[sa_index])): + cur_sa_mlps[radius_index] = [sa_in_channel] + list( + cur_sa_mlps[radius_index]) + sa_out_channel += cur_sa_mlps[radius_index][-1] + + if isinstance(fps_mods[sa_index], tuple): + cur_fps_mod = list(fps_mods[sa_index]) + else: + cur_fps_mod = list([fps_mods[sa_index]]) + + if isinstance(fps_sample_range_lists[sa_index], tuple): + cur_fps_sample_range_list = list( + fps_sample_range_lists[sa_index]) + else: + cur_fps_sample_range_list = list( + [fps_sample_range_lists[sa_index]]) + + self.SA_modules.append( + build_sa_module( + num_point=num_points[sa_index], + radii=radii[sa_index], + sample_nums=num_samples[sa_index], + mlp_channels=cur_sa_mlps, + fps_mod=cur_fps_mod, + fps_sample_range_list=cur_fps_sample_range_list, + dilated_group=dilated_group[sa_index], + norm_cfg=norm_cfg, + cfg=sa_cfg, + bias=True)) + skip_channel_list.append(sa_out_channel) + + cur_aggregation_channel = aggregation_channels[sa_index] + if cur_aggregation_channel is None: + self.aggregation_mlps.append(None) + sa_in_channel = sa_out_channel + else: + self.aggregation_mlps.append( + ConvModule( + sa_out_channel, + cur_aggregation_channel, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + kernel_size=1, + bias=True)) + sa_in_channel = cur_aggregation_channel + + @auto_fp16(apply_to=('points', )) + def forward(self, points): + """Forward pass. + + Args: + points (torch.Tensor): point coordinates with features, + with shape (B, N, 3 + input_feature_dim). + + Returns: + dict[str, torch.Tensor]: Outputs of the last SA module. + + - sa_xyz (torch.Tensor): The coordinates of sa features. + - sa_features (torch.Tensor): The features from the + last Set Aggregation Layers. + - sa_indices (torch.Tensor): Indices of the + input points. 
+ """ + xyz, features = self._split_point_feats(points) + + batch, num_points = xyz.shape[:2] + indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat( + batch, 1).long() + + sa_xyz = [xyz] + sa_features = [features] + sa_indices = [indices] + + out_sa_xyz = [xyz] + out_sa_features = [features] + out_sa_indices = [indices] + + for i in range(self.num_sa): + cur_xyz, cur_features, cur_indices = self.SA_modules[i]( + sa_xyz[i], sa_features[i]) + if self.aggregation_mlps[i] is not None: + cur_features = self.aggregation_mlps[i](cur_features) + sa_xyz.append(cur_xyz) + sa_features.append(cur_features) + sa_indices.append( + torch.gather(sa_indices[-1], 1, cur_indices.long())) + if i in self.out_indices: + out_sa_xyz.append(sa_xyz[-1]) + out_sa_features.append(sa_features[-1]) + out_sa_indices.append(sa_indices[-1]) + + return dict( + sa_xyz=out_sa_xyz, + sa_features=out_sa_features, + sa_indices=out_sa_indices) diff --git a/mmdet3d/models/backbones/pointnet2_sa_ssg.py b/mmdet3d/models/backbones/pointnet2_sa_ssg.py index c7b4152..33a65c0 100644 --- a/mmdet3d/models/backbones/pointnet2_sa_ssg.py +++ b/mmdet3d/models/backbones/pointnet2_sa_ssg.py @@ -1,143 +1,143 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.runner import auto_fp16 -from torch import nn as nn - -from mmdet3d.ops import PointFPModule, build_sa_module -from ..builder import BACKBONES -from .base_pointnet import BasePointNet - - -@BACKBONES.register_module() -class PointNet2SASSG(BasePointNet): - """PointNet2 with Single-scale grouping. - - Args: - in_channels (int): Input channels of point cloud. - num_points (tuple[int]): The number of points which each SA - module samples. - radius (tuple[float]): Sampling radii of each SA module. - num_samples (tuple[int]): The number of samples for ball - query in each SA module. - sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module. - fp_channels (tuple[tuple[int]]): Out channels of each mlp in FP module. - norm_cfg (dict): Config of normalization layer. - sa_cfg (dict): Config of set abstraction module, which may contain - the following keys and values: - - - pool_mod (str): Pool method ('max' or 'avg') for SA modules. - - use_xyz (bool): Whether to use xyz as a part of features. - - normalize_xyz (bool): Whether to normalize xyz with radii in - each SA module. 
- """ - - def __init__(self, - in_channels, - num_points=(2048, 1024, 512, 256), - radius=(0.2, 0.4, 0.8, 1.2), - num_samples=(64, 32, 16, 16), - sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), - (128, 128, 256)), - fp_channels=((256, 256), (256, 256)), - norm_cfg=dict(type='BN2d'), - sa_cfg=dict( - type='PointSAModule', - pool_mod='max', - use_xyz=True, - normalize_xyz=True), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.num_sa = len(sa_channels) - self.num_fp = len(fp_channels) - - assert len(num_points) == len(radius) == len(num_samples) == len( - sa_channels) - assert len(sa_channels) >= len(fp_channels) - - self.SA_modules = nn.ModuleList() - sa_in_channel = in_channels - 3 # number of channels without xyz - skip_channel_list = [sa_in_channel] - - for sa_index in range(self.num_sa): - cur_sa_mlps = list(sa_channels[sa_index]) - cur_sa_mlps = [sa_in_channel] + cur_sa_mlps - sa_out_channel = cur_sa_mlps[-1] - - self.SA_modules.append( - build_sa_module( - num_point=num_points[sa_index], - radius=radius[sa_index], - num_sample=num_samples[sa_index], - mlp_channels=cur_sa_mlps, - norm_cfg=norm_cfg, - cfg=sa_cfg)) - skip_channel_list.append(sa_out_channel) - sa_in_channel = sa_out_channel - - self.FP_modules = nn.ModuleList() - - fp_source_channel = skip_channel_list.pop() - fp_target_channel = skip_channel_list.pop() - for fp_index in range(len(fp_channels)): - cur_fp_mlps = list(fp_channels[fp_index]) - cur_fp_mlps = [fp_source_channel + fp_target_channel] + cur_fp_mlps - self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps)) - if fp_index != len(fp_channels) - 1: - fp_source_channel = cur_fp_mlps[-1] - fp_target_channel = skip_channel_list.pop() - - @auto_fp16(apply_to=('points', )) - def forward(self, points): - """Forward pass. - - Args: - points (torch.Tensor): point coordinates with features, - with shape (B, N, 3 + input_feature_dim). - - Returns: - dict[str, list[torch.Tensor]]: Outputs after SA and FP modules. - - - fp_xyz (list[torch.Tensor]): The coordinates of - each fp features. - - fp_features (list[torch.Tensor]): The features - from each Feature Propagate Layers. - - fp_indices (list[torch.Tensor]): Indices of the - input points. - """ - xyz, features = self._split_point_feats(points) - - batch, num_points = xyz.shape[:2] - indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat( - batch, 1).long() - - sa_xyz = [xyz] - sa_features = [features] - sa_indices = [indices] - - for i in range(self.num_sa): - cur_xyz, cur_features, cur_indices = self.SA_modules[i]( - sa_xyz[i], sa_features[i]) - sa_xyz.append(cur_xyz) - sa_features.append(cur_features) - sa_indices.append( - torch.gather(sa_indices[-1], 1, cur_indices.long())) - - fp_xyz = [sa_xyz[-1]] - fp_features = [sa_features[-1]] - fp_indices = [sa_indices[-1]] - - for i in range(self.num_fp): - fp_features.append(self.FP_modules[i]( - sa_xyz[self.num_sa - i - 1], sa_xyz[self.num_sa - i], - sa_features[self.num_sa - i - 1], fp_features[-1])) - fp_xyz.append(sa_xyz[self.num_sa - i - 1]) - fp_indices.append(sa_indices[self.num_sa - i - 1]) - - ret = dict( - fp_xyz=fp_xyz, - fp_features=fp_features, - fp_indices=fp_indices, - sa_xyz=sa_xyz, - sa_features=sa_features, - sa_indices=sa_indices) - return ret +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from mmcv.runner import auto_fp16 +from torch import nn as nn + +from mmdet3d.ops import PointFPModule, build_sa_module +from ..builder import BACKBONES +from .base_pointnet import BasePointNet + + +@BACKBONES.register_module() +class PointNet2SASSG(BasePointNet): + """PointNet2 with Single-scale grouping. + + Args: + in_channels (int): Input channels of point cloud. + num_points (tuple[int]): The number of points which each SA + module samples. + radius (tuple[float]): Sampling radii of each SA module. + num_samples (tuple[int]): The number of samples for ball + query in each SA module. + sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module. + fp_channels (tuple[tuple[int]]): Out channels of each mlp in FP module. + norm_cfg (dict): Config of normalization layer. + sa_cfg (dict): Config of set abstraction module, which may contain + the following keys and values: + + - pool_mod (str): Pool method ('max' or 'avg') for SA modules. + - use_xyz (bool): Whether to use xyz as a part of features. + - normalize_xyz (bool): Whether to normalize xyz with radii in + each SA module. + """ + + def __init__(self, + in_channels, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_sa = len(sa_channels) + self.num_fp = len(fp_channels) + + assert len(num_points) == len(radius) == len(num_samples) == len( + sa_channels) + assert len(sa_channels) >= len(fp_channels) + + self.SA_modules = nn.ModuleList() + sa_in_channel = in_channels - 3 # number of channels without xyz + skip_channel_list = [sa_in_channel] + + for sa_index in range(self.num_sa): + cur_sa_mlps = list(sa_channels[sa_index]) + cur_sa_mlps = [sa_in_channel] + cur_sa_mlps + sa_out_channel = cur_sa_mlps[-1] + + self.SA_modules.append( + build_sa_module( + num_point=num_points[sa_index], + radius=radius[sa_index], + num_sample=num_samples[sa_index], + mlp_channels=cur_sa_mlps, + norm_cfg=norm_cfg, + cfg=sa_cfg)) + skip_channel_list.append(sa_out_channel) + sa_in_channel = sa_out_channel + + self.FP_modules = nn.ModuleList() + + fp_source_channel = skip_channel_list.pop() + fp_target_channel = skip_channel_list.pop() + for fp_index in range(len(fp_channels)): + cur_fp_mlps = list(fp_channels[fp_index]) + cur_fp_mlps = [fp_source_channel + fp_target_channel] + cur_fp_mlps + self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps)) + if fp_index != len(fp_channels) - 1: + fp_source_channel = cur_fp_mlps[-1] + fp_target_channel = skip_channel_list.pop() + + @auto_fp16(apply_to=('points', )) + def forward(self, points): + """Forward pass. + + Args: + points (torch.Tensor): point coordinates with features, + with shape (B, N, 3 + input_feature_dim). + + Returns: + dict[str, list[torch.Tensor]]: Outputs after SA and FP modules. + + - fp_xyz (list[torch.Tensor]): The coordinates of + each fp features. + - fp_features (list[torch.Tensor]): The features + from each Feature Propagate Layers. + - fp_indices (list[torch.Tensor]): Indices of the + input points. 
+ """ + xyz, features = self._split_point_feats(points) + + batch, num_points = xyz.shape[:2] + indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat( + batch, 1).long() + + sa_xyz = [xyz] + sa_features = [features] + sa_indices = [indices] + + for i in range(self.num_sa): + cur_xyz, cur_features, cur_indices = self.SA_modules[i]( + sa_xyz[i], sa_features[i]) + sa_xyz.append(cur_xyz) + sa_features.append(cur_features) + sa_indices.append( + torch.gather(sa_indices[-1], 1, cur_indices.long())) + + fp_xyz = [sa_xyz[-1]] + fp_features = [sa_features[-1]] + fp_indices = [sa_indices[-1]] + + for i in range(self.num_fp): + fp_features.append(self.FP_modules[i]( + sa_xyz[self.num_sa - i - 1], sa_xyz[self.num_sa - i], + sa_features[self.num_sa - i - 1], fp_features[-1])) + fp_xyz.append(sa_xyz[self.num_sa - i - 1]) + fp_indices.append(sa_indices[self.num_sa - i - 1]) + + ret = dict( + fp_xyz=fp_xyz, + fp_features=fp_features, + fp_indices=fp_indices, + sa_xyz=sa_xyz, + sa_features=sa_features, + sa_indices=sa_indices) + return ret diff --git a/mmdet3d/models/backbones/second.py b/mmdet3d/models/backbones/second.py index 680dbbe..b629071 100644 --- a/mmdet3d/models/backbones/second.py +++ b/mmdet3d/models/backbones/second.py @@ -1,91 +1,91 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -from mmcv.cnn import build_conv_layer, build_norm_layer -from mmcv.runner import BaseModule -from torch import nn as nn - -from ..builder import BACKBONES - - -@BACKBONES.register_module() -class SECOND(BaseModule): - """Backbone network for SECOND/PointPillars/PartA2/MVXNet. - - Args: - in_channels (int): Input channels. - out_channels (list[int]): Output channels for multi-scale feature maps. - layer_nums (list[int]): Number of layers in each stage. - layer_strides (list[int]): Strides of each stage. - norm_cfg (dict): Config dict of normalization layers. - conv_cfg (dict): Config dict of convolutional layers. - """ - - def __init__(self, - in_channels=128, - out_channels=[128, 128, 256], - layer_nums=[3, 5, 5], - layer_strides=[2, 2, 2], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False), - init_cfg=None, - pretrained=None): - super(SECOND, self).__init__(init_cfg=init_cfg) - assert len(layer_strides) == len(layer_nums) - assert len(out_channels) == len(layer_nums) - - in_filters = [in_channels, *out_channels[:-1]] - # note that when stride > 1, conv2d with same padding isn't - # equal to pad-conv2d. we should use pad-conv2d. - blocks = [] - for i, layer_num in enumerate(layer_nums): - block = [ - build_conv_layer( - conv_cfg, - in_filters[i], - out_channels[i], - 3, - stride=layer_strides[i], - padding=1), - build_norm_layer(norm_cfg, out_channels[i])[1], - nn.ReLU(inplace=True), - ] - for j in range(layer_num): - block.append( - build_conv_layer( - conv_cfg, - out_channels[i], - out_channels[i], - 3, - padding=1)) - block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) - block.append(nn.ReLU(inplace=True)) - - block = nn.Sequential(*block) - blocks.append(block) - - self.blocks = nn.ModuleList(blocks) - - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - warnings.warn('DeprecationWarning: pretrained is a deprecated, ' - 'please use "init_cfg" instead') - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - else: - self.init_cfg = dict(type='Kaiming', layer='Conv2d') - - def forward(self, x): - """Forward function. 
- - Args: - x (torch.Tensor): Input with shape (N, C, H, W). - - Returns: - tuple[torch.Tensor]: Multi-scale features. - """ - outs = [] - for i in range(len(self.blocks)): - x = self.blocks[i](x) - outs.append(x) - return tuple(outs) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner import BaseModule +from torch import nn as nn + +from ..builder import BACKBONES + + +@BACKBONES.register_module() +class SECOND(BaseModule): + """Backbone network for SECOND/PointPillars/PartA2/MVXNet. + + Args: + in_channels (int): Input channels. + out_channels (list[int]): Output channels for multi-scale feature maps. + layer_nums (list[int]): Number of layers in each stage. + layer_strides (list[int]): Strides of each stage. + norm_cfg (dict): Config dict of normalization layers. + conv_cfg (dict): Config dict of convolutional layers. + """ + + def __init__(self, + in_channels=128, + out_channels=[128, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False), + init_cfg=None, + pretrained=None): + super(SECOND, self).__init__(init_cfg=init_cfg) + assert len(layer_strides) == len(layer_nums) + assert len(out_channels) == len(layer_nums) + + in_filters = [in_channels, *out_channels[:-1]] + # note that when stride > 1, conv2d with same padding isn't + # equal to pad-conv2d. we should use pad-conv2d. + blocks = [] + for i, layer_num in enumerate(layer_nums): + block = [ + build_conv_layer( + conv_cfg, + in_filters[i], + out_channels[i], + 3, + stride=layer_strides[i], + padding=1), + build_norm_layer(norm_cfg, out_channels[i])[1], + nn.ReLU(inplace=True), + ] + for j in range(layer_num): + block.append( + build_conv_layer( + conv_cfg, + out_channels[i], + out_channels[i], + 3, + padding=1)) + block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) + block.append(nn.ReLU(inplace=True)) + + block = nn.Sequential(*block) + blocks.append(block) + + self.blocks = nn.ModuleList(blocks) + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + else: + self.init_cfg = dict(type='Kaiming', layer='Conv2d') + + def forward(self, x): + """Forward function. + + Args: + x (torch.Tensor): Input with shape (N, C, H, W). + + Returns: + tuple[torch.Tensor]: Multi-scale features. + """ + outs = [] + for i in range(len(self.blocks)): + x = self.blocks[i](x) + outs.append(x) + return tuple(outs) diff --git a/mmdet3d/models/builder.py b/mmdet3d/models/builder.py index fb8b8c2..03ea22a 100644 --- a/mmdet3d/models/builder.py +++ b/mmdet3d/models/builder.py @@ -1,137 +1,137 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
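SECOND is a plain 2D convolutional backbone, so it can be smoke-tested on CPU without the custom point-cloud ops. A short sketch with a PointPillars-style pseudo-image size (numbers are illustrative, not from the patch):

```python
import torch
from mmdet3d.models.backbones import SECOND

self = SECOND(in_channels=64, out_channels=[64, 128, 256])
x = torch.rand(1, 64, 248, 216)   # BEV pseudo-image (N, C, H, W)
for out in self(x):
    print(tuple(out.shape))
# (1, 64, 124, 108)
# (1, 128, 62, 54)
# (1, 256, 31, 27)
```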
-import warnings - -from mmcv.cnn import MODELS as MMCV_MODELS -from mmcv.utils import Registry - -from mmdet.models.builder import BACKBONES as MMDET_BACKBONES -from mmdet.models.builder import DETECTORS as MMDET_DETECTORS -from mmdet.models.builder import HEADS as MMDET_HEADS -from mmdet.models.builder import LOSSES as MMDET_LOSSES -from mmdet.models.builder import NECKS as MMDET_NECKS -from mmdet.models.builder import ROI_EXTRACTORS as MMDET_ROI_EXTRACTORS -from mmdet.models.builder import SHARED_HEADS as MMDET_SHARED_HEADS -from mmseg.models.builder import LOSSES as MMSEG_LOSSES - -MODELS = Registry('models', parent=MMCV_MODELS) - -BACKBONES = MODELS -NECKS = MODELS -ROI_EXTRACTORS = MODELS -SHARED_HEADS = MODELS -HEADS = MODELS -LOSSES = MODELS -DETECTORS = MODELS -VOXEL_ENCODERS = MODELS -MIDDLE_ENCODERS = MODELS -FUSION_LAYERS = MODELS -SEGMENTORS = MODELS - - -def build_backbone(cfg): - """Build backbone.""" - if cfg['type'] in BACKBONES._module_dict.keys(): - return BACKBONES.build(cfg) - else: - return MMDET_BACKBONES.build(cfg) - - -def build_neck(cfg): - """Build neck.""" - if cfg['type'] in NECKS._module_dict.keys(): - return NECKS.build(cfg) - else: - return MMDET_NECKS.build(cfg) - - -def build_roi_extractor(cfg): - """Build RoI feature extractor.""" - if cfg['type'] in ROI_EXTRACTORS._module_dict.keys(): - return ROI_EXTRACTORS.build(cfg) - else: - return MMDET_ROI_EXTRACTORS.build(cfg) - - -def build_shared_head(cfg): - """Build shared head of detector.""" - if cfg['type'] in SHARED_HEADS._module_dict.keys(): - return SHARED_HEADS.build(cfg) - else: - return MMDET_SHARED_HEADS.build(cfg) - - -def build_head(cfg): - """Build head.""" - if cfg['type'] in HEADS._module_dict.keys(): - return HEADS.build(cfg) - else: - return MMDET_HEADS.build(cfg) - - -def build_loss(cfg): - """Build loss function.""" - if cfg['type'] in LOSSES._module_dict.keys(): - return LOSSES.build(cfg) - elif cfg['type'] in MMDET_LOSSES._module_dict.keys(): - return MMDET_LOSSES.build(cfg) - else: - return MMSEG_LOSSES.build(cfg) - - -def build_detector(cfg, train_cfg=None, test_cfg=None): - """Build detector.""" - if train_cfg is not None or test_cfg is not None: - warnings.warn( - 'train_cfg and test_cfg is deprecated, ' - 'please specify them in model', UserWarning) - assert cfg.get('train_cfg') is None or train_cfg is None, \ - 'train_cfg specified in both outer field and model field ' - assert cfg.get('test_cfg') is None or test_cfg is None, \ - 'test_cfg specified in both outer field and model field ' - if cfg['type'] in DETECTORS._module_dict.keys(): - return DETECTORS.build( - cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) - else: - return MMDET_DETECTORS.build( - cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) - - -def build_segmentor(cfg, train_cfg=None, test_cfg=None): - """Build segmentor.""" - if train_cfg is not None or test_cfg is not None: - warnings.warn( - 'train_cfg and test_cfg is deprecated, ' - 'please specify them in model', UserWarning) - assert cfg.get('train_cfg') is None or train_cfg is None, \ - 'train_cfg specified in both outer field and model field ' - assert cfg.get('test_cfg') is None or test_cfg is None, \ - 'test_cfg specified in both outer field and model field ' - return SEGMENTORS.build( - cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) - - -def build_model(cfg, train_cfg=None, test_cfg=None): - """A function warpper for building 3D detector or segmentor according to - cfg. - - Should be deprecated in the future. 
- """ - if cfg.type in ['EncoderDecoder3D']: - return build_segmentor(cfg, train_cfg=train_cfg, test_cfg=test_cfg) - else: - return build_detector(cfg, train_cfg=train_cfg, test_cfg=test_cfg) - - -def build_voxel_encoder(cfg): - """Build voxel encoder.""" - return VOXEL_ENCODERS.build(cfg) - - -def build_middle_encoder(cfg): - """Build middle level encoder.""" - return MIDDLE_ENCODERS.build(cfg) - - -def build_fusion_layer(cfg): - """Build fusion layer.""" - return FUSION_LAYERS.build(cfg) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.utils import Registry + +from mmdet.models.builder import BACKBONES as MMDET_BACKBONES +from mmdet.models.builder import DETECTORS as MMDET_DETECTORS +from mmdet.models.builder import HEADS as MMDET_HEADS +from mmdet.models.builder import LOSSES as MMDET_LOSSES +from mmdet.models.builder import NECKS as MMDET_NECKS +from mmdet.models.builder import ROI_EXTRACTORS as MMDET_ROI_EXTRACTORS +from mmdet.models.builder import SHARED_HEADS as MMDET_SHARED_HEADS +from mmseg.models.builder import LOSSES as MMSEG_LOSSES + +MODELS = Registry('models', parent=MMCV_MODELS) + +BACKBONES = MODELS +NECKS = MODELS +ROI_EXTRACTORS = MODELS +SHARED_HEADS = MODELS +HEADS = MODELS +LOSSES = MODELS +DETECTORS = MODELS +VOXEL_ENCODERS = MODELS +MIDDLE_ENCODERS = MODELS +FUSION_LAYERS = MODELS +SEGMENTORS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + if cfg['type'] in BACKBONES._module_dict.keys(): + return BACKBONES.build(cfg) + else: + return MMDET_BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + if cfg['type'] in NECKS._module_dict.keys(): + return NECKS.build(cfg) + else: + return MMDET_NECKS.build(cfg) + + +def build_roi_extractor(cfg): + """Build RoI feature extractor.""" + if cfg['type'] in ROI_EXTRACTORS._module_dict.keys(): + return ROI_EXTRACTORS.build(cfg) + else: + return MMDET_ROI_EXTRACTORS.build(cfg) + + +def build_shared_head(cfg): + """Build shared head of detector.""" + if cfg['type'] in SHARED_HEADS._module_dict.keys(): + return SHARED_HEADS.build(cfg) + else: + return MMDET_SHARED_HEADS.build(cfg) + + +def build_head(cfg): + """Build head.""" + if cfg['type'] in HEADS._module_dict.keys(): + return HEADS.build(cfg) + else: + return MMDET_HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss function.""" + if cfg['type'] in LOSSES._module_dict.keys(): + return LOSSES.build(cfg) + elif cfg['type'] in MMDET_LOSSES._module_dict.keys(): + return MMDET_LOSSES.build(cfg) + else: + return MMSEG_LOSSES.build(cfg) + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model', UserWarning) + assert cfg.get('train_cfg') is None or train_cfg is None, \ + 'train_cfg specified in both outer field and model field ' + assert cfg.get('test_cfg') is None or test_cfg is None, \ + 'test_cfg specified in both outer field and model field ' + if cfg['type'] in DETECTORS._module_dict.keys(): + return DETECTORS.build( + cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + else: + return MMDET_DETECTORS.build( + cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_segmentor(cfg, train_cfg=None, test_cfg=None): + """Build segmentor.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them 
in model', UserWarning) + assert cfg.get('train_cfg') is None or train_cfg is None, \ + 'train_cfg specified in both outer field and model field ' + assert cfg.get('test_cfg') is None or test_cfg is None, \ + 'test_cfg specified in both outer field and model field ' + return SEGMENTORS.build( + cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_model(cfg, train_cfg=None, test_cfg=None): + """A function warpper for building 3D detector or segmentor according to + cfg. + + Should be deprecated in the future. + """ + if cfg.type in ['EncoderDecoder3D']: + return build_segmentor(cfg, train_cfg=train_cfg, test_cfg=test_cfg) + else: + return build_detector(cfg, train_cfg=train_cfg, test_cfg=test_cfg) + + +def build_voxel_encoder(cfg): + """Build voxel encoder.""" + return VOXEL_ENCODERS.build(cfg) + + +def build_middle_encoder(cfg): + """Build middle level encoder.""" + return MIDDLE_ENCODERS.build(cfg) + + +def build_fusion_layer(cfg): + """Build fusion layer.""" + return FUSION_LAYERS.build(cfg) diff --git a/mmdet3d/models/decode_heads/__init__.py b/mmdet3d/models/decode_heads/__init__.py index 2e86c7c..c9a4390 100644 --- a/mmdet3d/models/decode_heads/__init__.py +++ b/mmdet3d/models/decode_heads/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .dgcnn_head import DGCNNHead -from .paconv_head import PAConvHead -from .pointnet2_head import PointNet2Head - -__all__ = ['PointNet2Head', 'DGCNNHead', 'PAConvHead'] +# Copyright (c) OpenMMLab. All rights reserved. +from .dgcnn_head import DGCNNHead +from .paconv_head import PAConvHead +from .pointnet2_head import PointNet2Head + +__all__ = ['PointNet2Head', 'DGCNNHead', 'PAConvHead'] diff --git a/mmdet3d/models/decode_heads/decode_head.py b/mmdet3d/models/decode_heads/decode_head.py index 6ccbfe0..03e84aa 100644 --- a/mmdet3d/models/decode_heads/decode_head.py +++ b/mmdet3d/models/decode_heads/decode_head.py @@ -1,123 +1,123 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from abc import ABCMeta, abstractmethod - -from mmcv.cnn import normal_init -from mmcv.runner import BaseModule, auto_fp16, force_fp32 -from torch import nn as nn - -from mmseg.models.builder import build_loss - - -class Base3DDecodeHead(BaseModule, metaclass=ABCMeta): - """Base class for BaseDecodeHead. - - Args: - channels (int): Channels after modules, before conv_seg. - num_classes (int): Number of classes. - dropout_ratio (float, optional): Ratio of dropout layer. Default: 0.5. - conv_cfg (dict, optional): Config of conv layers. - Default: dict(type='Conv1d'). - norm_cfg (dict, optional): Config of norm layers. - Default: dict(type='BN1d'). - act_cfg (dict, optional): Config of activation layers. - Default: dict(type='ReLU'). - loss_decode (dict, optional): Config of decode loss. - Default: dict(type='CrossEntropyLoss'). - ignore_index (int, optional): The label index to be ignored. - When using masked BCE loss, ignore_index should be set to None. - Default: 255. 
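The builders above dispatch on whichever registry actually contains the requested type: components registered in mmdet3d are built from its shared MODELS registry, anything else falls back to the corresponding mmdet registry, and losses additionally fall through to mmseg. A small illustration (the class names are standard mmdet/mmdet3d ones, chosen only to show the fallback path):

```python
from mmdet3d.models import build_backbone, build_loss

# 'SECOND' is registered in mmdet3d, so the local registry builds it.
pts_backbone = build_backbone(dict(type='SECOND', in_channels=64))

# 'ResNet' is not in mmdet3d's registry, so the call falls through to mmdet.
img_backbone = build_backbone(
    dict(type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3)))

# Losses are tried in the order mmdet3d -> mmdet -> mmseg.
loss_cls = build_loss(dict(type='FocalLoss', use_sigmoid=True, loss_weight=1.0))
```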
- """ - - def __init__(self, - channels, - num_classes, - dropout_ratio=0.5, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - loss_decode=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - class_weight=None, - loss_weight=1.0), - ignore_index=255, - init_cfg=None): - super(Base3DDecodeHead, self).__init__(init_cfg=init_cfg) - self.channels = channels - self.num_classes = num_classes - self.dropout_ratio = dropout_ratio - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.loss_decode = build_loss(loss_decode) - self.ignore_index = ignore_index - - self.conv_seg = nn.Conv1d(channels, num_classes, kernel_size=1) - if dropout_ratio > 0: - self.dropout = nn.Dropout(dropout_ratio) - else: - self.dropout = None - self.fp16_enabled = False - - def init_weights(self): - """Initialize weights of classification layer.""" - super().init_weights() - normal_init(self.conv_seg, mean=0, std=0.01) - - @auto_fp16() - @abstractmethod - def forward(self, inputs): - """Placeholder of forward function.""" - pass - - def forward_train(self, inputs, img_metas, pts_semantic_mask, train_cfg): - """Forward function for training. - - Args: - inputs (list[torch.Tensor]): List of multi-level point features. - img_metas (list[dict]): Meta information of each sample. - pts_semantic_mask (torch.Tensor): Semantic segmentation masks - used if the architecture supports semantic segmentation task. - train_cfg (dict): The training config. - - Returns: - dict[str, Tensor]: a dictionary of loss components - """ - seg_logits = self.forward(inputs) - losses = self.losses(seg_logits, pts_semantic_mask) - return losses - - def forward_test(self, inputs, img_metas, test_cfg): - """Forward function for testing. - - Args: - inputs (list[Tensor]): List of multi-level point features. - img_metas (list[dict]): Meta information of each sample. - test_cfg (dict): The testing config. - - Returns: - Tensor: Output segmentation map. - """ - return self.forward(inputs) - - def cls_seg(self, feat): - """Classify each points.""" - if self.dropout is not None: - feat = self.dropout(feat) - output = self.conv_seg(feat) - return output - - @force_fp32(apply_to=('seg_logit', )) - def losses(self, seg_logit, seg_label): - """Compute semantic segmentation loss. - - Args: - seg_logit (torch.Tensor): Predicted per-point segmentation logits - of shape [B, num_classes, N]. - seg_label (torch.Tensor): Ground-truth segmentation label of - shape [B, N]. - """ - loss = dict() - loss['loss_sem_seg'] = self.loss_decode( - seg_logit, seg_label, ignore_index=self.ignore_index) - return loss +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +from mmcv.cnn import normal_init +from mmcv.runner import BaseModule, auto_fp16, force_fp32 +from torch import nn as nn + +from mmseg.models.builder import build_loss + + +class Base3DDecodeHead(BaseModule, metaclass=ABCMeta): + """Base class for BaseDecodeHead. + + Args: + channels (int): Channels after modules, before conv_seg. + num_classes (int): Number of classes. + dropout_ratio (float, optional): Ratio of dropout layer. Default: 0.5. + conv_cfg (dict, optional): Config of conv layers. + Default: dict(type='Conv1d'). + norm_cfg (dict, optional): Config of norm layers. + Default: dict(type='BN1d'). + act_cfg (dict, optional): Config of activation layers. + Default: dict(type='ReLU'). + loss_decode (dict, optional): Config of decode loss. + Default: dict(type='CrossEntropyLoss'). 
+ ignore_index (int, optional): The label index to be ignored. + When using masked BCE loss, ignore_index should be set to None. + Default: 255. + """ + + def __init__(self, + channels, + num_classes, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, + loss_weight=1.0), + ignore_index=255, + init_cfg=None): + super(Base3DDecodeHead, self).__init__(init_cfg=init_cfg) + self.channels = channels + self.num_classes = num_classes + self.dropout_ratio = dropout_ratio + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.loss_decode = build_loss(loss_decode) + self.ignore_index = ignore_index + + self.conv_seg = nn.Conv1d(channels, num_classes, kernel_size=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout(dropout_ratio) + else: + self.dropout = None + self.fp16_enabled = False + + def init_weights(self): + """Initialize weights of classification layer.""" + super().init_weights() + normal_init(self.conv_seg, mean=0, std=0.01) + + @auto_fp16() + @abstractmethod + def forward(self, inputs): + """Placeholder of forward function.""" + pass + + def forward_train(self, inputs, img_metas, pts_semantic_mask, train_cfg): + """Forward function for training. + + Args: + inputs (list[torch.Tensor]): List of multi-level point features. + img_metas (list[dict]): Meta information of each sample. + pts_semantic_mask (torch.Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + train_cfg (dict): The training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + seg_logits = self.forward(inputs) + losses = self.losses(seg_logits, pts_semantic_mask) + return losses + + def forward_test(self, inputs, img_metas, test_cfg): + """Forward function for testing. + + Args: + inputs (list[Tensor]): List of multi-level point features. + img_metas (list[dict]): Meta information of each sample. + test_cfg (dict): The testing config. + + Returns: + Tensor: Output segmentation map. + """ + return self.forward(inputs) + + def cls_seg(self, feat): + """Classify each points.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + @force_fp32(apply_to=('seg_logit', )) + def losses(self, seg_logit, seg_label): + """Compute semantic segmentation loss. + + Args: + seg_logit (torch.Tensor): Predicted per-point segmentation logits + of shape [B, num_classes, N]. + seg_label (torch.Tensor): Ground-truth segmentation label of + shape [B, N]. + """ + loss = dict() + loss['loss_sem_seg'] = self.loss_decode( + seg_logit, seg_label, ignore_index=self.ignore_index) + return loss diff --git a/mmdet3d/models/decode_heads/dgcnn_head.py b/mmdet3d/models/decode_heads/dgcnn_head.py index 1249b3d..8146e75 100644 --- a/mmdet3d/models/decode_heads/dgcnn_head.py +++ b/mmdet3d/models/decode_heads/dgcnn_head.py @@ -1,67 +1,67 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn.bricks import ConvModule - -from mmdet3d.ops import DGCNNFPModule -from ..builder import HEADS -from .decode_head import Base3DDecodeHead - - -@HEADS.register_module() -class DGCNNHead(Base3DDecodeHead): - r"""DGCNN decoder head. - - Decoder head used in `DGCNN `_. - Refer to the - `reimplementation code `_. - - Args: - fp_channels (tuple[int], optional): Tuple of mlp channels in feature - propagation (FP) modules. Defaults to (1216, 512). 
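Concrete heads derived from Base3DDecodeHead only need to implement `forward`; dropout, the final 1x1 classification conv and the segmentation loss come from the base class. A minimal hypothetical subclass (name and behaviour invented purely for illustration):

```python
from mmdet3d.models.builder import HEADS
from mmdet3d.models.decode_heads.decode_head import Base3DDecodeHead


@HEADS.register_module()
class IdentitySegHead(Base3DDecodeHead):
    """Toy head: classify the finest-level backbone features directly."""

    def forward(self, feat_dict):
        # (B, C, N) features; C must equal the `channels` passed to __init__.
        feats = feat_dict['fp_features'][-1]
        return self.cls_seg(feats)
```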
- """ - - def __init__(self, fp_channels=(1216, 512), **kwargs): - super(DGCNNHead, self).__init__(**kwargs) - - self.FP_module = DGCNNFPModule( - mlp_channels=fp_channels, act_cfg=self.act_cfg) - - # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40 - self.pre_seg_conv = ConvModule( - fp_channels[-1], - self.channels, - kernel_size=1, - bias=False, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def _extract_input(self, feat_dict): - """Extract inputs from features dictionary. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - torch.Tensor: points for decoder. - """ - fa_points = feat_dict['fa_points'] - - return fa_points - - def forward(self, feat_dict): - """Forward pass. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - torch.Tensor: Segmentation map of shape [B, num_classes, N]. - """ - fa_points = self._extract_input(feat_dict) - - fp_points = self.FP_module(fa_points) - fp_points = fp_points.transpose(1, 2).contiguous() - output = self.pre_seg_conv(fp_points) - output = self.cls_seg(output) - - return output +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn.bricks import ConvModule + +from mmdet3d.ops import DGCNNFPModule +from ..builder import HEADS +from .decode_head import Base3DDecodeHead + + +@HEADS.register_module() +class DGCNNHead(Base3DDecodeHead): + r"""DGCNN decoder head. + + Decoder head used in `DGCNN `_. + Refer to the + `reimplementation code `_. + + Args: + fp_channels (tuple[int], optional): Tuple of mlp channels in feature + propagation (FP) modules. Defaults to (1216, 512). + """ + + def __init__(self, fp_channels=(1216, 512), **kwargs): + super(DGCNNHead, self).__init__(**kwargs) + + self.FP_module = DGCNNFPModule( + mlp_channels=fp_channels, act_cfg=self.act_cfg) + + # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40 + self.pre_seg_conv = ConvModule( + fp_channels[-1], + self.channels, + kernel_size=1, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _extract_input(self, feat_dict): + """Extract inputs from features dictionary. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: points for decoder. + """ + fa_points = feat_dict['fa_points'] + + return fa_points + + def forward(self, feat_dict): + """Forward pass. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: Segmentation map of shape [B, num_classes, N]. + """ + fa_points = self._extract_input(feat_dict) + + fp_points = self.FP_module(fa_points) + fp_points = fp_points.transpose(1, 2).contiguous() + output = self.pre_seg_conv(fp_points) + output = self.cls_seg(output) + + return output diff --git a/mmdet3d/models/decode_heads/paconv_head.py b/mmdet3d/models/decode_heads/paconv_head.py index 63cc3fd..963987d 100644 --- a/mmdet3d/models/decode_heads/paconv_head.py +++ b/mmdet3d/models/decode_heads/paconv_head.py @@ -1,63 +1,63 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn.bricks import ConvModule - -from ..builder import HEADS -from .pointnet2_head import PointNet2Head - - -@HEADS.register_module() -class PAConvHead(PointNet2Head): - r"""PAConv decoder head. - - Decoder head used in `PAConv `_. - Refer to the `official code `_. - - Args: - fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. - fp_norm_cfg (dict): Config of norm layers used in FP modules. - Default: dict(type='BN2d'). 
- """ - - def __init__(self, - fp_channels=((768, 256, 256), (384, 256, 256), - (320, 256, 128), (128 + 6, 128, 128, 128)), - fp_norm_cfg=dict(type='BN2d'), - **kwargs): - super(PAConvHead, self).__init__(fp_channels, fp_norm_cfg, **kwargs) - - # https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/pointnet2/pointnet2_paconv_seg.py#L53 - # PointNet++'s decoder conv has bias while PAConv's doesn't have - # so we need to rebuild it here - self.pre_seg_conv = ConvModule( - fp_channels[-1][-1], - self.channels, - kernel_size=1, - bias=False, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def forward(self, feat_dict): - """Forward pass. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - torch.Tensor: Segmentation map of shape [B, num_classes, N]. - """ - sa_xyz, sa_features = self._extract_input(feat_dict) - - # PointNet++ doesn't use the first level of `sa_features` as input - # while PAConv inputs it through skip-connection - fp_feature = sa_features[-1] - - for i in range(self.num_fp): - # consume the points in a bottom-up manner - fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], - sa_features[-(i + 2)], fp_feature) - - output = self.pre_seg_conv(fp_feature) - output = self.cls_seg(output) - - return output +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn.bricks import ConvModule + +from ..builder import HEADS +from .pointnet2_head import PointNet2Head + + +@HEADS.register_module() +class PAConvHead(PointNet2Head): + r"""PAConv decoder head. + + Decoder head used in `PAConv `_. + Refer to the `official code `_. + + Args: + fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. + fp_norm_cfg (dict): Config of norm layers used in FP modules. + Default: dict(type='BN2d'). + """ + + def __init__(self, + fp_channels=((768, 256, 256), (384, 256, 256), + (320, 256, 128), (128 + 6, 128, 128, 128)), + fp_norm_cfg=dict(type='BN2d'), + **kwargs): + super(PAConvHead, self).__init__(fp_channels, fp_norm_cfg, **kwargs) + + # https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/pointnet2/pointnet2_paconv_seg.py#L53 + # PointNet++'s decoder conv has bias while PAConv's doesn't have + # so we need to rebuild it here + self.pre_seg_conv = ConvModule( + fp_channels[-1][-1], + self.channels, + kernel_size=1, + bias=False, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, feat_dict): + """Forward pass. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: Segmentation map of shape [B, num_classes, N]. + """ + sa_xyz, sa_features = self._extract_input(feat_dict) + + # PointNet++ doesn't use the first level of `sa_features` as input + # while PAConv inputs it through skip-connection + fp_feature = sa_features[-1] + + for i in range(self.num_fp): + # consume the points in a bottom-up manner + fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], + sa_features[-(i + 2)], fp_feature) + + output = self.pre_seg_conv(fp_feature) + output = self.cls_seg(output) + + return output diff --git a/mmdet3d/models/decode_heads/pointnet2_head.py b/mmdet3d/models/decode_heads/pointnet2_head.py index 28b677e..34b11ac 100644 --- a/mmdet3d/models/decode_heads/pointnet2_head.py +++ b/mmdet3d/models/decode_heads/pointnet2_head.py @@ -1,85 +1,85 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from mmcv.cnn.bricks import ConvModule -from torch import nn as nn - -from mmdet3d.ops import PointFPModule -from ..builder import HEADS -from .decode_head import Base3DDecodeHead - - -@HEADS.register_module() -class PointNet2Head(Base3DDecodeHead): - r"""PointNet2 decoder head. - - Decoder head used in `PointNet++ `_. - Refer to the `official code `_. - - Args: - fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. - fp_norm_cfg (dict): Config of norm layers used in FP modules. - Default: dict(type='BN2d'). - """ - - def __init__(self, - fp_channels=((768, 256, 256), (384, 256, 256), - (320, 256, 128), (128, 128, 128, 128)), - fp_norm_cfg=dict(type='BN2d'), - **kwargs): - super(PointNet2Head, self).__init__(**kwargs) - - self.num_fp = len(fp_channels) - self.FP_modules = nn.ModuleList() - for cur_fp_mlps in fp_channels: - self.FP_modules.append( - PointFPModule(mlp_channels=cur_fp_mlps, norm_cfg=fp_norm_cfg)) - - # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40 - self.pre_seg_conv = ConvModule( - fp_channels[-1][-1], - self.channels, - kernel_size=1, - bias=True, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg) - - def _extract_input(self, feat_dict): - """Extract inputs from features dictionary. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - list[torch.Tensor]: Coordinates of multiple levels of points. - list[torch.Tensor]: Features of multiple levels of points. - """ - sa_xyz = feat_dict['sa_xyz'] - sa_features = feat_dict['sa_features'] - assert len(sa_xyz) == len(sa_features) - - return sa_xyz, sa_features - - def forward(self, feat_dict): - """Forward pass. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - torch.Tensor: Segmentation map of shape [B, num_classes, N]. - """ - sa_xyz, sa_features = self._extract_input(feat_dict) - - # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L24 - sa_features[0] = None - - fp_feature = sa_features[-1] - - for i in range(self.num_fp): - # consume the points in a bottom-up manner - fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], - sa_features[-(i + 2)], fp_feature) - output = self.pre_seg_conv(fp_feature) - output = self.cls_seg(output) - - return output +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn.bricks import ConvModule +from torch import nn as nn + +from mmdet3d.ops import PointFPModule +from ..builder import HEADS +from .decode_head import Base3DDecodeHead + + +@HEADS.register_module() +class PointNet2Head(Base3DDecodeHead): + r"""PointNet2 decoder head. + + Decoder head used in `PointNet++ `_. + Refer to the `official code `_. + + Args: + fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. + fp_norm_cfg (dict): Config of norm layers used in FP modules. + Default: dict(type='BN2d'). 
+ """ + + def __init__(self, + fp_channels=((768, 256, 256), (384, 256, 256), + (320, 256, 128), (128, 128, 128, 128)), + fp_norm_cfg=dict(type='BN2d'), + **kwargs): + super(PointNet2Head, self).__init__(**kwargs) + + self.num_fp = len(fp_channels) + self.FP_modules = nn.ModuleList() + for cur_fp_mlps in fp_channels: + self.FP_modules.append( + PointFPModule(mlp_channels=cur_fp_mlps, norm_cfg=fp_norm_cfg)) + + # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40 + self.pre_seg_conv = ConvModule( + fp_channels[-1][-1], + self.channels, + kernel_size=1, + bias=True, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _extract_input(self, feat_dict): + """Extract inputs from features dictionary. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + list[torch.Tensor]: Coordinates of multiple levels of points. + list[torch.Tensor]: Features of multiple levels of points. + """ + sa_xyz = feat_dict['sa_xyz'] + sa_features = feat_dict['sa_features'] + assert len(sa_xyz) == len(sa_features) + + return sa_xyz, sa_features + + def forward(self, feat_dict): + """Forward pass. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: Segmentation map of shape [B, num_classes, N]. + """ + sa_xyz, sa_features = self._extract_input(feat_dict) + + # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L24 + sa_features[0] = None + + fp_feature = sa_features[-1] + + for i in range(self.num_fp): + # consume the points in a bottom-up manner + fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], + sa_features[-(i + 2)], fp_feature) + output = self.pre_seg_conv(fp_feature) + output = self.cls_seg(output) + + return output diff --git a/mmdet3d/models/dense_heads/__init__.py b/mmdet3d/models/dense_heads/__init__.py index 25008c9..a0a1b34 100644 --- a/mmdet3d/models/dense_heads/__init__.py +++ b/mmdet3d/models/dense_heads/__init__.py @@ -1,25 +1,25 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .anchor3d_head import Anchor3DHead -from .anchor_free_mono3d_head import AnchorFreeMono3DHead -from .base_conv_bbox_head import BaseConvBboxHead -from .base_mono3d_dense_head import BaseMono3DDenseHead -from .centerpoint_head import CenterHead -from .fcos_mono3d_head import FCOSMono3DHead -from .free_anchor3d_head import FreeAnchor3DHead -from .groupfree3d_head import GroupFree3DHead -from .monoflex_head import MonoFlexHead -from .parta2_rpn_head import PartA2RPNHead -from .pgd_head import PGDHead -from .point_rpn_head import PointRPNHead -from .shape_aware_head import ShapeAwareHead -from .smoke_mono3d_head import SMOKEMono3DHead -from .ssd_3d_head import SSD3DHead -from .vote_head import VoteHead - -__all__ = [ - 'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead', - 'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead', - 'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead', - 'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead', - 'MonoFlexHead' -] +# Copyright (c) OpenMMLab. All rights reserved. 
+from .anchor3d_head import Anchor3DHead +from .anchor_free_mono3d_head import AnchorFreeMono3DHead +from .base_conv_bbox_head import BaseConvBboxHead +from .base_mono3d_dense_head import BaseMono3DDenseHead +from .centerpoint_head import CenterHead +from .fcos_mono3d_head import FCOSMono3DHead +from .free_anchor3d_head import FreeAnchor3DHead +from .groupfree3d_head import GroupFree3DHead +from .monoflex_head import MonoFlexHead +from .parta2_rpn_head import PartA2RPNHead +from .pgd_head import PGDHead +from .point_rpn_head import PointRPNHead +from .shape_aware_head import ShapeAwareHead +from .smoke_mono3d_head import SMOKEMono3DHead +from .ssd_3d_head import SSD3DHead +from .vote_head import VoteHead + +__all__ = [ + 'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead', + 'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead', + 'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead', + 'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead', + 'MonoFlexHead' +] diff --git a/mmdet3d/models/dense_heads/anchor3d_head.py b/mmdet3d/models/dense_heads/anchor3d_head.py index b747264..1857fb1 100644 --- a/mmdet3d/models/dense_heads/anchor3d_head.py +++ b/mmdet3d/models/dense_heads/anchor3d_head.py @@ -1,516 +1,516 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from mmcv.runner import BaseModule, force_fp32 -from torch import nn as nn - -from mmdet3d.core import (PseudoSampler, box3d_multiclass_nms, limit_period, - xywhr2xyxyr) -from mmdet.core import (build_assigner, build_bbox_coder, - build_prior_generator, build_sampler, multi_apply) -from ..builder import HEADS, build_loss -from .train_mixins import AnchorTrainMixin - - -@HEADS.register_module() -class Anchor3DHead(BaseModule, AnchorTrainMixin): - """Anchor head for SECOND/PointPillars/MVXNet/PartA2. - - Args: - num_classes (int): Number of classes. - in_channels (int): Number of channels in the input feature map. - train_cfg (dict): Train configs. - test_cfg (dict): Test configs. - feat_channels (int): Number of channels of the feature map. - use_direction_classifier (bool): Whether to add a direction classifier. - anchor_generator(dict): Config dict of anchor generator. - assigner_per_size (bool): Whether to do assignment for each separate - anchor size. - assign_per_class (bool): Whether to do assignment for each class. - diff_rad_by_sin (bool): Whether to change the difference into sin - difference for box regression loss. - dir_offset (float | int): The offset of BEV rotation angles. - (TODO: may be moved into box coder) - dir_limit_offset (float | int): The limited range of BEV - rotation angles. (TODO: may be moved into box coder) - bbox_coder (dict): Config dict of box coders. - loss_cls (dict): Config of classification loss. - loss_bbox (dict): Config of localization loss. - loss_dir (dict): Config of direction classifier loss. 
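Because the Anchor3DHead diff below only re-adds the file verbatim, a short usage sketch may help reviewers who are only checking the style change. This is a hedged example: the values mirror the defaults in the `__init__` signature shown below (roughly the KITTI car PointPillars setup), it assumes an mmdet3d installation that exposes `build_head` from `mmdet3d.models`, and the `test_cfg` keys are illustrative placeholders rather than values taken from this patch.

# Hedged sketch: build an Anchor3DHead from a config dict via the HEADS registry.
from mmdet3d.models import build_head  # assumed entry point; requires mmdet3d installed

head_cfg = dict(
    type='Anchor3DHead',
    num_classes=1,
    in_channels=384,
    feat_channels=384,
    train_cfg=None,  # assigner/sampler are only built when a train_cfg is provided
    test_cfg=dict(   # illustrative NMS settings, not part of this patch
        use_rotate_nms=True,
        nms_thr=0.01,
        score_thr=0.1,
        nms_pre=100,
        max_num=50),
    anchor_generator=dict(
        type='Anchor3DRangeGenerator',
        range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
        strides=[2],
        sizes=[[3.9, 1.6, 1.56]],
        rotations=[0, 1.57],
        reshape_out=True),
    bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'))

head = build_head(head_cfg)  # resolved through @HEADS.register_module() in the file below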
- """ - - def __init__(self, - num_classes, - in_channels, - train_cfg, - test_cfg, - feat_channels=256, - use_direction_classifier=True, - anchor_generator=dict( - type='Anchor3DRangeGenerator', - range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], - strides=[2], - sizes=[[3.9, 1.6, 1.56]], - rotations=[0, 1.57], - custom_values=[], - reshape_out=False), - assigner_per_size=False, - assign_per_class=False, - diff_rad_by_sin=True, - dir_offset=-np.pi / 2, - dir_limit_offset=0, - bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - loss_weight=1.0), - loss_bbox=dict( - type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), - loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.in_channels = in_channels - self.num_classes = num_classes - self.feat_channels = feat_channels - self.diff_rad_by_sin = diff_rad_by_sin - self.use_direction_classifier = use_direction_classifier - self.train_cfg = train_cfg - self.test_cfg = test_cfg - self.assigner_per_size = assigner_per_size - self.assign_per_class = assign_per_class - self.dir_offset = dir_offset - self.dir_limit_offset = dir_limit_offset - import warnings - warnings.warn( - 'dir_offset and dir_limit_offset will be depressed and be ' - 'incorporated into box coder in the future') - self.fp16_enabled = False - - # build anchor generator - self.anchor_generator = build_prior_generator(anchor_generator) - # In 3D detection, the anchor stride is connected with anchor size - self.num_anchors = self.anchor_generator.num_base_anchors - # build box coder - self.bbox_coder = build_bbox_coder(bbox_coder) - self.box_code_size = self.bbox_coder.code_size - - # build loss function - self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) - self.sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC'] - if not self.use_sigmoid_cls: - self.num_classes += 1 - self.loss_cls = build_loss(loss_cls) - self.loss_bbox = build_loss(loss_bbox) - self.loss_dir = build_loss(loss_dir) - self.fp16_enabled = False - - self._init_layers() - self._init_assigner_sampler() - - if init_cfg is None: - self.init_cfg = dict( - type='Normal', - layer='Conv2d', - std=0.01, - override=dict( - type='Normal', name='conv_cls', std=0.01, bias_prob=0.01)) - - def _init_assigner_sampler(self): - """Initialize the target assigner and sampler of the head.""" - if self.train_cfg is None: - return - - if self.sampling: - self.bbox_sampler = build_sampler(self.train_cfg.sampler) - else: - self.bbox_sampler = PseudoSampler() - if isinstance(self.train_cfg.assigner, dict): - self.bbox_assigner = build_assigner(self.train_cfg.assigner) - elif isinstance(self.train_cfg.assigner, list): - self.bbox_assigner = [ - build_assigner(res) for res in self.train_cfg.assigner - ] - - def _init_layers(self): - """Initialize neural network layers of the head.""" - self.cls_out_channels = self.num_anchors * self.num_classes - self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) - self.conv_reg = nn.Conv2d(self.feat_channels, - self.num_anchors * self.box_code_size, 1) - if self.use_direction_classifier: - self.conv_dir_cls = nn.Conv2d(self.feat_channels, - self.num_anchors * 2, 1) - - def forward_single(self, x): - """Forward function on a single-scale feature map. - - Args: - x (torch.Tensor): Input features. - - Returns: - tuple[torch.Tensor]: Contain score of each class, bbox - regression and direction classification predictions. 
- """ - cls_score = self.conv_cls(x) - bbox_pred = self.conv_reg(x) - dir_cls_preds = None - if self.use_direction_classifier: - dir_cls_preds = self.conv_dir_cls(x) - return cls_score, bbox_pred, dir_cls_preds - - def forward(self, feats): - """Forward pass. - - Args: - feats (list[torch.Tensor]): Multi-level features, e.g., - features produced by FPN. - - Returns: - tuple[list[torch.Tensor]]: Multi-level class score, bbox - and direction predictions. - """ - return multi_apply(self.forward_single, feats) - - def get_anchors(self, featmap_sizes, input_metas, device='cuda'): - """Get anchors according to feature map sizes. - - Args: - featmap_sizes (list[tuple]): Multi-level feature map sizes. - input_metas (list[dict]): contain pcd and img's meta info. - device (str): device of current module. - - Returns: - list[list[torch.Tensor]]: Anchors of each image, valid flags - of each image. - """ - num_imgs = len(input_metas) - # since feature map sizes of all images are the same, we only compute - # anchors for one time - multi_level_anchors = self.anchor_generator.grid_anchors( - featmap_sizes, device=device) - anchor_list = [multi_level_anchors for _ in range(num_imgs)] - return anchor_list - - def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, - label_weights, bbox_targets, bbox_weights, dir_targets, - dir_weights, num_total_samples): - """Calculate loss of Single-level results. - - Args: - cls_score (torch.Tensor): Class score in single-level. - bbox_pred (torch.Tensor): Bbox prediction in single-level. - dir_cls_preds (torch.Tensor): Predictions of direction class - in single-level. - labels (torch.Tensor): Labels of class. - label_weights (torch.Tensor): Weights of class loss. - bbox_targets (torch.Tensor): Targets of bbox predictions. - bbox_weights (torch.Tensor): Weights of bbox loss. - dir_targets (torch.Tensor): Targets of direction predictions. - dir_weights (torch.Tensor): Weights of direction loss. - num_total_samples (int): The number of valid samples. - - Returns: - tuple[torch.Tensor]: Losses of class, bbox - and direction, respectively. 
- """ - # classification loss - if num_total_samples is None: - num_total_samples = int(cls_score.shape[0]) - labels = labels.reshape(-1) - label_weights = label_weights.reshape(-1) - cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes) - assert labels.max().item() <= self.num_classes - loss_cls = self.loss_cls( - cls_score, labels, label_weights, avg_factor=num_total_samples) - - # regression loss - bbox_pred = bbox_pred.permute(0, 2, 3, - 1).reshape(-1, self.box_code_size) - bbox_targets = bbox_targets.reshape(-1, self.box_code_size) - bbox_weights = bbox_weights.reshape(-1, self.box_code_size) - - bg_class_ind = self.num_classes - pos_inds = ((labels >= 0) - & (labels < bg_class_ind)).nonzero( - as_tuple=False).reshape(-1) - num_pos = len(pos_inds) - - pos_bbox_pred = bbox_pred[pos_inds] - pos_bbox_targets = bbox_targets[pos_inds] - pos_bbox_weights = bbox_weights[pos_inds] - - # dir loss - if self.use_direction_classifier: - dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2) - dir_targets = dir_targets.reshape(-1) - dir_weights = dir_weights.reshape(-1) - pos_dir_cls_preds = dir_cls_preds[pos_inds] - pos_dir_targets = dir_targets[pos_inds] - pos_dir_weights = dir_weights[pos_inds] - - if num_pos > 0: - code_weight = self.train_cfg.get('code_weight', None) - if code_weight: - pos_bbox_weights = pos_bbox_weights * bbox_weights.new_tensor( - code_weight) - if self.diff_rad_by_sin: - pos_bbox_pred, pos_bbox_targets = self.add_sin_difference( - pos_bbox_pred, pos_bbox_targets) - loss_bbox = self.loss_bbox( - pos_bbox_pred, - pos_bbox_targets, - pos_bbox_weights, - avg_factor=num_total_samples) - - # direction classification loss - loss_dir = None - if self.use_direction_classifier: - loss_dir = self.loss_dir( - pos_dir_cls_preds, - pos_dir_targets, - pos_dir_weights, - avg_factor=num_total_samples) - else: - loss_bbox = pos_bbox_pred.sum() - if self.use_direction_classifier: - loss_dir = pos_dir_cls_preds.sum() - - return loss_cls, loss_bbox, loss_dir - - @staticmethod - def add_sin_difference(boxes1, boxes2): - """Convert the rotation difference to difference in sine function. - - Args: - boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7 - and the 7th dimension is rotation dimension. - boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and - the 7th dimension is rotation dimension. - - Returns: - tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th - dimensions are changed. - """ - rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( - boxes2[..., 6:7]) - rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., - 6:7]) - boxes1 = torch.cat( - [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) - boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], - dim=-1) - return boxes1, boxes2 - - @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) - def loss(self, - cls_scores, - bbox_preds, - dir_cls_preds, - gt_bboxes, - gt_labels, - input_metas, - gt_bboxes_ignore=None): - """Calculate losses. - - Args: - cls_scores (list[torch.Tensor]): Multi-level class scores. - bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. - dir_cls_preds (list[torch.Tensor]): Multi-level direction - class predictions. - gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes - of each sample. - gt_labels (list[torch.Tensor]): Gt labels of each sample. - input_metas (list[dict]): Contain pcd and img's meta info. 
- gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding boxes to ignore. - - Returns: - dict[str, list[torch.Tensor]]: Classification, bbox, and - direction losses of each level. - - - loss_cls (list[torch.Tensor]): Classification losses. - - loss_bbox (list[torch.Tensor]): Box regression losses. - - loss_dir (list[torch.Tensor]): Direction classification - losses. - """ - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - assert len(featmap_sizes) == self.anchor_generator.num_levels - device = cls_scores[0].device - anchor_list = self.get_anchors( - featmap_sizes, input_metas, device=device) - label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 - cls_reg_targets = self.anchor_target_3d( - anchor_list, - gt_bboxes, - input_metas, - gt_bboxes_ignore_list=gt_bboxes_ignore, - gt_labels_list=gt_labels, - num_classes=self.num_classes, - label_channels=label_channels, - sampling=self.sampling) - - if cls_reg_targets is None: - return None - (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, - dir_targets_list, dir_weights_list, num_total_pos, - num_total_neg) = cls_reg_targets - num_total_samples = ( - num_total_pos + num_total_neg if self.sampling else num_total_pos) - - # num_total_samples = None - losses_cls, losses_bbox, losses_dir = multi_apply( - self.loss_single, - cls_scores, - bbox_preds, - dir_cls_preds, - labels_list, - label_weights_list, - bbox_targets_list, - bbox_weights_list, - dir_targets_list, - dir_weights_list, - num_total_samples=num_total_samples) - return dict( - loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir) - - def get_bboxes(self, - cls_scores, - bbox_preds, - dir_cls_preds, - input_metas, - cfg=None, - rescale=False): - """Get bboxes of anchor head. - - Args: - cls_scores (list[torch.Tensor]): Multi-level class scores. - bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. - dir_cls_preds (list[torch.Tensor]): Multi-level direction - class predictions. - input_metas (list[dict]): Contain pcd and img's meta info. - cfg (:obj:`ConfigDict`): Training or testing config. - rescale (list[torch.Tensor]): Whether th rescale bbox. - - Returns: - list[tuple]: Prediction resultes of batches. - """ - assert len(cls_scores) == len(bbox_preds) - assert len(cls_scores) == len(dir_cls_preds) - num_levels = len(cls_scores) - featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] - device = cls_scores[0].device - mlvl_anchors = self.anchor_generator.grid_anchors( - featmap_sizes, device=device) - mlvl_anchors = [ - anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors - ] - - result_list = [] - for img_id in range(len(input_metas)): - cls_score_list = [ - cls_scores[i][img_id].detach() for i in range(num_levels) - ] - bbox_pred_list = [ - bbox_preds[i][img_id].detach() for i in range(num_levels) - ] - dir_cls_pred_list = [ - dir_cls_preds[i][img_id].detach() for i in range(num_levels) - ] - - input_meta = input_metas[img_id] - proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, - dir_cls_pred_list, mlvl_anchors, - input_meta, cfg, rescale) - result_list.append(proposals) - return result_list - - def get_bboxes_single(self, - cls_scores, - bbox_preds, - dir_cls_preds, - mlvl_anchors, - input_meta, - cfg=None, - rescale=False): - """Get bboxes of single branch. - - Args: - cls_scores (torch.Tensor): Class score in single batch. - bbox_preds (torch.Tensor): Bbox prediction in single batch. 
- dir_cls_preds (torch.Tensor): Predictions of direction class - in single batch. - mlvl_anchors (List[torch.Tensor]): Multi-level anchors - in single batch. - input_meta (list[dict]): Contain pcd and img's meta info. - cfg (:obj:`ConfigDict`): Training or testing config. - rescale (list[torch.Tensor]): whether th rescale bbox. - - Returns: - tuple: Contain predictions of single batch. - - - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - - scores (torch.Tensor): Class score of each bbox. - - labels (torch.Tensor): Label of each bbox. - """ - cfg = self.test_cfg if cfg is None else cfg - assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) - mlvl_bboxes = [] - mlvl_scores = [] - mlvl_dir_scores = [] - for cls_score, bbox_pred, dir_cls_pred, anchors in zip( - cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): - assert cls_score.size()[-2:] == bbox_pred.size()[-2:] - assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] - dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) - dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] - - cls_score = cls_score.permute(1, 2, - 0).reshape(-1, self.num_classes) - if self.use_sigmoid_cls: - scores = cls_score.sigmoid() - else: - scores = cls_score.softmax(-1) - bbox_pred = bbox_pred.permute(1, 2, - 0).reshape(-1, self.box_code_size) - - nms_pre = cfg.get('nms_pre', -1) - if nms_pre > 0 and scores.shape[0] > nms_pre: - if self.use_sigmoid_cls: - max_scores, _ = scores.max(dim=1) - else: - max_scores, _ = scores[:, :-1].max(dim=1) - _, topk_inds = max_scores.topk(nms_pre) - anchors = anchors[topk_inds, :] - bbox_pred = bbox_pred[topk_inds, :] - scores = scores[topk_inds, :] - dir_cls_score = dir_cls_score[topk_inds] - - bboxes = self.bbox_coder.decode(anchors, bbox_pred) - mlvl_bboxes.append(bboxes) - mlvl_scores.append(scores) - mlvl_dir_scores.append(dir_cls_score) - - mlvl_bboxes = torch.cat(mlvl_bboxes) - mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( - mlvl_bboxes, box_dim=self.box_code_size).bev) - mlvl_scores = torch.cat(mlvl_scores) - mlvl_dir_scores = torch.cat(mlvl_dir_scores) - - if self.use_sigmoid_cls: - # Add a dummy background class to the front when using sigmoid - padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) - mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) - - score_thr = cfg.get('score_thr', 0) - results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, - mlvl_scores, score_thr, cfg.max_num, - cfg, mlvl_dir_scores) - bboxes, scores, labels, dir_scores = results - if bboxes.shape[0] > 0: - dir_rot = limit_period(bboxes[..., 6] - self.dir_offset, - self.dir_limit_offset, np.pi) - bboxes[..., 6] = ( - dir_rot + self.dir_offset + - np.pi * dir_scores.to(bboxes.dtype)) - bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size) - return bboxes, scores, labels +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn + +from mmdet3d.core import (PseudoSampler, box3d_multiclass_nms, limit_period, + xywhr2xyxyr) +from mmdet.core import (build_assigner, build_bbox_coder, + build_prior_generator, build_sampler, multi_apply) +from ..builder import HEADS, build_loss +from .train_mixins import AnchorTrainMixin + + +@HEADS.register_module() +class Anchor3DHead(BaseModule, AnchorTrainMixin): + """Anchor head for SECOND/PointPillars/MVXNet/PartA2. + + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. 
+ train_cfg (dict): Train configs. + test_cfg (dict): Test configs. + feat_channels (int): Number of channels of the feature map. + use_direction_classifier (bool): Whether to add a direction classifier. + anchor_generator(dict): Config dict of anchor generator. + assigner_per_size (bool): Whether to do assignment for each separate + anchor size. + assign_per_class (bool): Whether to do assignment for each class. + diff_rad_by_sin (bool): Whether to change the difference into sin + difference for box regression loss. + dir_offset (float | int): The offset of BEV rotation angles. + (TODO: may be moved into box coder) + dir_limit_offset (float | int): The limited range of BEV + rotation angles. (TODO: may be moved into box coder) + bbox_coder (dict): Config dict of box coders. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + loss_dir (dict): Config of direction classifier loss. + """ + + def __init__(self, + num_classes, + in_channels, + train_cfg, + test_cfg, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + strides=[2], + sizes=[[3.9, 1.6, 1.56]], + rotations=[0, 1.57], + custom_values=[], + reshape_out=False), + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=-np.pi / 2, + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.diff_rad_by_sin = diff_rad_by_sin + self.use_direction_classifier = use_direction_classifier + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.assigner_per_size = assigner_per_size + self.assign_per_class = assign_per_class + self.dir_offset = dir_offset + self.dir_limit_offset = dir_limit_offset + import warnings + warnings.warn( + 'dir_offset and dir_limit_offset will be depressed and be ' + 'incorporated into box coder in the future') + self.fp16_enabled = False + + # build anchor generator + self.anchor_generator = build_prior_generator(anchor_generator) + # In 3D detection, the anchor stride is connected with anchor size + self.num_anchors = self.anchor_generator.num_base_anchors + # build box coder + self.bbox_coder = build_bbox_coder(bbox_coder) + self.box_code_size = self.bbox_coder.code_size + + # build loss function + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC'] + if not self.use_sigmoid_cls: + self.num_classes += 1 + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_dir = build_loss(loss_dir) + self.fp16_enabled = False + + self._init_layers() + self._init_assigner_sampler() + + if init_cfg is None: + self.init_cfg = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', name='conv_cls', std=0.01, bias_prob=0.01)) + + def _init_assigner_sampler(self): + """Initialize the target assigner and sampler of the head.""" + if self.train_cfg is None: + return + + if self.sampling: + self.bbox_sampler = build_sampler(self.train_cfg.sampler) + else: + self.bbox_sampler = PseudoSampler() + if 
isinstance(self.train_cfg.assigner, dict): + self.bbox_assigner = build_assigner(self.train_cfg.assigner) + elif isinstance(self.train_cfg.assigner, list): + self.bbox_assigner = [ + build_assigner(res) for res in self.train_cfg.assigner + ] + + def _init_layers(self): + """Initialize neural network layers of the head.""" + self.cls_out_channels = self.num_anchors * self.num_classes + self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) + self.conv_reg = nn.Conv2d(self.feat_channels, + self.num_anchors * self.box_code_size, 1) + if self.use_direction_classifier: + self.conv_dir_cls = nn.Conv2d(self.feat_channels, + self.num_anchors * 2, 1) + + def forward_single(self, x): + """Forward function on a single-scale feature map. + + Args: + x (torch.Tensor): Input features. + + Returns: + tuple[torch.Tensor]: Contain score of each class, bbox + regression and direction classification predictions. + """ + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + dir_cls_preds = None + if self.use_direction_classifier: + dir_cls_preds = self.conv_dir_cls(x) + return cls_score, bbox_pred, dir_cls_preds + + def forward(self, feats): + """Forward pass. + + Args: + feats (list[torch.Tensor]): Multi-level features, e.g., + features produced by FPN. + + Returns: + tuple[list[torch.Tensor]]: Multi-level class score, bbox + and direction predictions. + """ + return multi_apply(self.forward_single, feats) + + def get_anchors(self, featmap_sizes, input_metas, device='cuda'): + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + input_metas (list[dict]): contain pcd and img's meta info. + device (str): device of current module. + + Returns: + list[list[torch.Tensor]]: Anchors of each image, valid flags + of each image. + """ + num_imgs = len(input_metas) + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + return anchor_list + + def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, + label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, num_total_samples): + """Calculate loss of Single-level results. + + Args: + cls_score (torch.Tensor): Class score in single-level. + bbox_pred (torch.Tensor): Bbox prediction in single-level. + dir_cls_preds (torch.Tensor): Predictions of direction class + in single-level. + labels (torch.Tensor): Labels of class. + label_weights (torch.Tensor): Weights of class loss. + bbox_targets (torch.Tensor): Targets of bbox predictions. + bbox_weights (torch.Tensor): Weights of bbox loss. + dir_targets (torch.Tensor): Targets of direction predictions. + dir_weights (torch.Tensor): Weights of direction loss. + num_total_samples (int): The number of valid samples. + + Returns: + tuple[torch.Tensor]: Losses of class, bbox + and direction, respectively. 
+ """ + # classification loss + if num_total_samples is None: + num_total_samples = int(cls_score.shape[0]) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes) + assert labels.max().item() <= self.num_classes + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + + # regression loss + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, self.box_code_size) + bbox_targets = bbox_targets.reshape(-1, self.box_code_size) + bbox_weights = bbox_weights.reshape(-1, self.box_code_size) + + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero( + as_tuple=False).reshape(-1) + num_pos = len(pos_inds) + + pos_bbox_pred = bbox_pred[pos_inds] + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_weights = bbox_weights[pos_inds] + + # dir loss + if self.use_direction_classifier: + dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2) + dir_targets = dir_targets.reshape(-1) + dir_weights = dir_weights.reshape(-1) + pos_dir_cls_preds = dir_cls_preds[pos_inds] + pos_dir_targets = dir_targets[pos_inds] + pos_dir_weights = dir_weights[pos_inds] + + if num_pos > 0: + code_weight = self.train_cfg.get('code_weight', None) + if code_weight: + pos_bbox_weights = pos_bbox_weights * bbox_weights.new_tensor( + code_weight) + if self.diff_rad_by_sin: + pos_bbox_pred, pos_bbox_targets = self.add_sin_difference( + pos_bbox_pred, pos_bbox_targets) + loss_bbox = self.loss_bbox( + pos_bbox_pred, + pos_bbox_targets, + pos_bbox_weights, + avg_factor=num_total_samples) + + # direction classification loss + loss_dir = None + if self.use_direction_classifier: + loss_dir = self.loss_dir( + pos_dir_cls_preds, + pos_dir_targets, + pos_dir_weights, + avg_factor=num_total_samples) + else: + loss_bbox = pos_bbox_pred.sum() + if self.use_direction_classifier: + loss_dir = pos_dir_cls_preds.sum() + + return loss_cls, loss_bbox, loss_dir + + @staticmethod + def add_sin_difference(boxes1, boxes2): + """Convert the rotation difference to difference in sine function. + + Args: + boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7 + and the 7th dimension is rotation dimension. + boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and + the 7th dimension is rotation dimension. + + Returns: + tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th + dimensions are changed. + """ + rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( + boxes2[..., 6:7]) + rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., + 6:7]) + boxes1 = torch.cat( + [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) + boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], + dim=-1) + return boxes1, boxes2 + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + gt_bboxes, + gt_labels, + input_metas, + gt_bboxes_ignore=None): + """Calculate losses. + + Args: + cls_scores (list[torch.Tensor]): Multi-level class scores. + bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. + dir_cls_preds (list[torch.Tensor]): Multi-level direction + class predictions. + gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes + of each sample. + gt_labels (list[torch.Tensor]): Gt labels of each sample. + input_metas (list[dict]): Contain pcd and img's meta info. 
+ gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding boxes to ignore. + + Returns: + dict[str, list[torch.Tensor]]: Classification, bbox, and + direction losses of each level. + + - loss_cls (list[torch.Tensor]): Classification losses. + - loss_bbox (list[torch.Tensor]): Box regression losses. + - loss_dir (list[torch.Tensor]): Direction classification + losses. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + device = cls_scores[0].device + anchor_list = self.get_anchors( + featmap_sizes, input_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.anchor_target_3d( + anchor_list, + gt_bboxes, + input_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + num_classes=self.num_classes, + label_channels=label_channels, + sampling=self.sampling) + + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + dir_targets_list, dir_weights_list, num_total_pos, + num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # num_total_samples = None + losses_cls, losses_bbox, losses_dir = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + dir_cls_preds, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + dir_targets_list, + dir_weights_list, + num_total_samples=num_total_samples) + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir) + + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + input_metas, + cfg=None, + rescale=False): + """Get bboxes of anchor head. + + Args: + cls_scores (list[torch.Tensor]): Multi-level class scores. + bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. + dir_cls_preds (list[torch.Tensor]): Multi-level direction + class predictions. + input_metas (list[dict]): Contain pcd and img's meta info. + cfg (:obj:`ConfigDict`): Training or testing config. + rescale (list[torch.Tensor]): Whether th rescale bbox. + + Returns: + list[tuple]: Prediction resultes of batches. + """ + assert len(cls_scores) == len(bbox_preds) + assert len(cls_scores) == len(dir_cls_preds) + num_levels = len(cls_scores) + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + device = cls_scores[0].device + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + mlvl_anchors = [ + anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors + ] + + result_list = [] + for img_id in range(len(input_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() for i in range(num_levels) + ] + + input_meta = input_metas[img_id] + proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, + dir_cls_pred_list, mlvl_anchors, + input_meta, cfg, rescale) + result_list.append(proposals) + return result_list + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + mlvl_anchors, + input_meta, + cfg=None, + rescale=False): + """Get bboxes of single branch. + + Args: + cls_scores (torch.Tensor): Class score in single batch. + bbox_preds (torch.Tensor): Bbox prediction in single batch. 
+ dir_cls_preds (torch.Tensor): Predictions of direction class + in single batch. + mlvl_anchors (List[torch.Tensor]): Multi-level anchors + in single batch. + input_meta (list[dict]): Contain pcd and img's meta info. + cfg (:obj:`ConfigDict`): Training or testing config. + rescale (list[torch.Tensor]): whether th rescale bbox. + + Returns: + tuple: Contain predictions of single batch. + + - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. + - scores (torch.Tensor): Class score of each bbox. + - labels (torch.Tensor): Label of each bbox. + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + for cls_score, bbox_pred, dir_cls_pred, anchors in zip( + cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.num_classes) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, self.box_code_size) + + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_score = dir_cls_score[topk_inds] + + bboxes = self.bbox_coder.decode(anchors, bbox_pred) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + mlvl_bboxes, box_dim=self.box_code_size).bev) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + if self.use_sigmoid_cls: + # Add a dummy background class to the front when using sigmoid + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + + score_thr = cfg.get('score_thr', 0) + results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_scores, score_thr, cfg.max_num, + cfg, mlvl_dir_scores) + bboxes, scores, labels, dir_scores = results + if bboxes.shape[0] > 0: + dir_rot = limit_period(bboxes[..., 6] - self.dir_offset, + self.dir_limit_offset, np.pi) + bboxes[..., 6] = ( + dir_rot + self.dir_offset + + np.pi * dir_scores.to(bboxes.dtype)) + bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size) + return bboxes, scores, labels diff --git a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py index e9b27d0..c0932af 100644 --- a/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py +++ b/mmdet3d/models/dense_heads/anchor_free_mono3d_head.py @@ -1,534 +1,534 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
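The tail of `get_bboxes_single` above folds the decoded yaw with `limit_period` and then re-attaches the predicted direction bin. Below is a minimal standalone sketch of that post-processing, with a local re-implementation of `limit_period` for illustration only (the real helper ships in `mmdet3d.core`) and toy values for the yaw and the direction score:

import math

def limit_period(val, offset, period):
    # Local re-implementation for illustration; mmdet3d.core provides its own version.
    return val - math.floor(val / period + offset) * period

dir_offset = -math.pi / 2      # default in Anchor3DHead above
dir_limit_offset = 0
raw_yaw = 2.3                  # toy decoded yaw from the bbox coder
dir_score = 1                  # toy direction-classifier bin (0 or 1)

dir_rot = limit_period(raw_yaw - dir_offset, dir_limit_offset, math.pi)
final_yaw = dir_rot + dir_offset + math.pi * dir_score
print(final_yaw)  # yaw folded into a half circle, then flipped by pi when the bin says so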
-from abc import abstractmethod - -import torch -from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init -from mmcv.runner import force_fp32 -from torch import nn as nn - -from mmdet.core import multi_apply -from ..builder import HEADS, build_loss -from .base_mono3d_dense_head import BaseMono3DDenseHead - - -@HEADS.register_module() -class AnchorFreeMono3DHead(BaseMono3DDenseHead): - """Anchor-free head for monocular 3D object detection. - - Args: - num_classes (int): Number of categories excluding the background - category. - in_channels (int): Number of channels in the input feature map. - feat_channels (int, optional): Number of hidden channels. - Used in child classes. Defaults to 256. - stacked_convs (int, optional): Number of stacking convs of the head. - strides (tuple, optional): Downsample factor of each feature map. - dcn_on_last_conv (bool, optional): If true, use dcn in the last - layer of towers. Default: False. - conv_bias (bool | str, optional): If specified as `auto`, it will be - decided by the norm_cfg. Bias of conv will be set as True - if `norm_cfg` is None, otherwise False. Default: 'auto'. - background_label (int, optional): Label ID of background, - set as 0 for RPN and num_classes for other heads. - It will automatically set as `num_classes` if None is given. - use_direction_classifier (bool, optional): - Whether to add a direction classifier. - diff_rad_by_sin (bool, optional): Whether to change the difference - into sin difference for box regression loss. Defaults to True. - dir_offset (float, optional): Parameter used in direction - classification. Defaults to 0. - dir_limit_offset (float, optional): Parameter used in direction - classification. Defaults to 0. - loss_cls (dict, optional): Config of classification loss. - loss_bbox (dict, optional): Config of localization loss. - loss_dir (dict, optional): Config of direction classifier loss. - loss_attr (dict, optional): Config of attribute classifier loss, - which is only active when `pred_attrs=True`. - bbox_code_size (int, optional): Dimensions of predicted bounding boxes. - pred_attrs (bool, optional): Whether to predict attributes. - Defaults to False. - num_attrs (int, optional): The number of attributes to be predicted. - Default: 9. - pred_velo (bool, optional): Whether to predict velocity. - Defaults to False. - pred_bbox2d (bool, optional): Whether to predict 2D boxes. - Defaults to False. - group_reg_dims (tuple[int], optional): The dimension of each regression - target group. Default: (2, 1, 3, 1, 2). - cls_branch (tuple[int], optional): Channels for classification branch. - Default: (128, 64). - reg_branch (tuple[tuple], optional): Channels for regression branch. - Default: ( - (128, 64), # offset - (128, 64), # depth - (64, ), # size - (64, ), # rot - () # velo - ), - dir_branch (tuple[int], optional): Channels for direction - classification branch. Default: (64, ). - attr_branch (tuple[int], optional): Channels for classification branch. - Default: (64, ). - conv_cfg (dict, optional): Config dict for convolution layer. - Default: None. - norm_cfg (dict, optional): Config dict for normalization layer. - Default: None. - train_cfg (dict, optional): Training config of anchor head. - test_cfg (dict, optional): Testing config of anchor head. 
- """ # noqa: W605 - - _version = 1 - - def __init__( - self, - num_classes, - in_channels, - feat_channels=256, - stacked_convs=4, - strides=(4, 8, 16, 32, 64), - dcn_on_last_conv=False, - conv_bias='auto', - background_label=None, - use_direction_classifier=True, - diff_rad_by_sin=True, - dir_offset=0, - dir_limit_offset=0, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict( - type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), - loss_dir=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), - loss_attr=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), - bbox_code_size=9, # For nuscenes - pred_attrs=False, - num_attrs=9, # For nuscenes - pred_velo=False, - pred_bbox2d=False, - group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo, - cls_branch=(128, 64), - reg_branch=( - (128, 64), # offset - (128, 64), # depth - (64, ), # size - (64, ), # rot - () # velo - ), - dir_branch=(64, ), - attr_branch=(64, ), - conv_cfg=None, - norm_cfg=None, - train_cfg=None, - test_cfg=None, - init_cfg=None): - super(AnchorFreeMono3DHead, self).__init__(init_cfg=init_cfg) - self.num_classes = num_classes - self.cls_out_channels = num_classes - self.in_channels = in_channels - self.feat_channels = feat_channels - self.stacked_convs = stacked_convs - self.strides = strides - self.dcn_on_last_conv = dcn_on_last_conv - assert conv_bias == 'auto' or isinstance(conv_bias, bool) - self.conv_bias = conv_bias - self.use_direction_classifier = use_direction_classifier - self.diff_rad_by_sin = diff_rad_by_sin - self.dir_offset = dir_offset - self.dir_limit_offset = dir_limit_offset - self.loss_cls = build_loss(loss_cls) - self.loss_bbox = build_loss(loss_bbox) - self.loss_dir = build_loss(loss_dir) - self.bbox_code_size = bbox_code_size - self.group_reg_dims = list(group_reg_dims) - self.cls_branch = cls_branch - self.reg_branch = reg_branch - assert len(reg_branch) == len(group_reg_dims), 'The number of '\ - 'element in reg_branch and group_reg_dims should be the same.' 
- self.pred_velo = pred_velo - self.pred_bbox2d = pred_bbox2d - self.out_channels = [] - for reg_branch_channels in reg_branch: - if len(reg_branch_channels) > 0: - self.out_channels.append(reg_branch_channels[-1]) - else: - self.out_channels.append(-1) - self.dir_branch = dir_branch - self.train_cfg = train_cfg - self.test_cfg = test_cfg - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.fp16_enabled = False - self.background_label = ( - num_classes if background_label is None else background_label) - # background_label should be either 0 or num_classes - assert (self.background_label == 0 - or self.background_label == num_classes) - self.pred_attrs = pred_attrs - self.attr_background_label = -1 - self.num_attrs = num_attrs - if self.pred_attrs: - self.attr_background_label = num_attrs - self.loss_attr = build_loss(loss_attr) - self.attr_branch = attr_branch - - self._init_layers() - - def _init_layers(self): - """Initialize layers of the head.""" - self._init_cls_convs() - self._init_reg_convs() - self._init_predictor() - - def _init_cls_convs(self): - """Initialize classification conv layers of the head.""" - self.cls_convs = nn.ModuleList() - for i in range(self.stacked_convs): - chn = self.in_channels if i == 0 else self.feat_channels - if self.dcn_on_last_conv and i == self.stacked_convs - 1: - conv_cfg = dict(type='DCNv2') - else: - conv_cfg = self.conv_cfg - self.cls_convs.append( - ConvModule( - chn, - self.feat_channels, - 3, - stride=1, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=self.norm_cfg, - bias=self.conv_bias)) - - def _init_reg_convs(self): - """Initialize bbox regression conv layers of the head.""" - self.reg_convs = nn.ModuleList() - for i in range(self.stacked_convs): - chn = self.in_channels if i == 0 else self.feat_channels - if self.dcn_on_last_conv and i == self.stacked_convs - 1: - conv_cfg = dict(type='DCNv2') - else: - conv_cfg = self.conv_cfg - self.reg_convs.append( - ConvModule( - chn, - self.feat_channels, - 3, - stride=1, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=self.norm_cfg, - bias=self.conv_bias)) - - def _init_branch(self, conv_channels=(64), conv_strides=(1)): - """Initialize conv layers as a prediction branch.""" - conv_before_pred = nn.ModuleList() - if isinstance(conv_channels, int): - conv_channels = [self.feat_channels] + [conv_channels] - conv_strides = [conv_strides] - else: - conv_channels = [self.feat_channels] + list(conv_channels) - conv_strides = list(conv_strides) - for i in range(len(conv_strides)): - conv_before_pred.append( - ConvModule( - conv_channels[i], - conv_channels[i + 1], - 3, - stride=conv_strides[i], - padding=1, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - bias=self.conv_bias)) - - return conv_before_pred - - def _init_predictor(self): - """Initialize predictor layers of the head.""" - self.conv_cls_prev = self._init_branch( - conv_channels=self.cls_branch, - conv_strides=(1, ) * len(self.cls_branch)) - self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels, - 1) - self.conv_reg_prevs = nn.ModuleList() - self.conv_regs = nn.ModuleList() - for i in range(len(self.group_reg_dims)): - reg_dim = self.group_reg_dims[i] - reg_branch_channels = self.reg_branch[i] - out_channel = self.out_channels[i] - if len(reg_branch_channels) > 0: - self.conv_reg_prevs.append( - self._init_branch( - conv_channels=reg_branch_channels, - conv_strides=(1, ) * len(reg_branch_channels))) - self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1)) - else: - self.conv_reg_prevs.append(None) - 
self.conv_regs.append( - nn.Conv2d(self.feat_channels, reg_dim, 1)) - if self.use_direction_classifier: - self.conv_dir_cls_prev = self._init_branch( - conv_channels=self.dir_branch, - conv_strides=(1, ) * len(self.dir_branch)) - self.conv_dir_cls = nn.Conv2d(self.dir_branch[-1], 2, 1) - if self.pred_attrs: - self.conv_attr_prev = self._init_branch( - conv_channels=self.attr_branch, - conv_strides=(1, ) * len(self.attr_branch)) - self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1) - - def init_weights(self): - """Initialize weights of the head. - - We currently still use the customized defined init_weights because the - default init of DCN triggered by the init_cfg will init - conv_offset.weight, which mistakenly affects the training stability. - """ - for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]: - for m in modules: - if isinstance(m.conv, nn.Conv2d): - normal_init(m.conv, std=0.01) - for conv_reg_prev in self.conv_reg_prevs: - if conv_reg_prev is None: - continue - for m in conv_reg_prev: - if isinstance(m.conv, nn.Conv2d): - normal_init(m.conv, std=0.01) - if self.use_direction_classifier: - for m in self.conv_dir_cls_prev: - if isinstance(m.conv, nn.Conv2d): - normal_init(m.conv, std=0.01) - if self.pred_attrs: - for m in self.conv_attr_prev: - if isinstance(m.conv, nn.Conv2d): - normal_init(m.conv, std=0.01) - bias_cls = bias_init_with_prob(0.01) - normal_init(self.conv_cls, std=0.01, bias=bias_cls) - for conv_reg in self.conv_regs: - normal_init(conv_reg, std=0.01) - if self.use_direction_classifier: - normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls) - if self.pred_attrs: - normal_init(self.conv_attr, std=0.01, bias=bias_cls) - - def forward(self, feats): - """Forward features from the upstream network. - - Args: - feats (tuple[Tensor]): Features from the upstream network, each is - a 4D-tensor. - - Returns: - tuple: Usually contain classification scores, bbox predictions, - and direction class predictions. - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_classes. - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2) - attr_preds (list[Tensor]): Attribute scores for each scale - level, each is a 4D-tensor, the channel number is - num_points * num_attrs. - """ - return multi_apply(self.forward_single, feats)[:5] - - def forward_single(self, x): - """Forward features of a single scale level. - - Args: - x (Tensor): FPN feature maps of the specified stride. - - Returns: - tuple: Scores for each class, bbox predictions, direction class, - and attributes, features after classification and regression - conv layers, some models needs these features like FCOS. 
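The `_init_branch` helper above is mostly channel bookkeeping: it prepends `feat_channels` to the requested branch widths and chains 3x3 ConvModules between consecutive entries, after which a 1x1 conv maps the last width to the prediction size. A plain-Python sketch of that bookkeeping for the default `cls_branch=(128, 64)` (widths taken from the signature above, the printout is purely illustrative):

# Channel bookkeeping behind _init_branch() for the default classification branch.
feat_channels = 256
cls_branch = (128, 64)          # default from the signature above
num_classes = 10                # toy value for illustration

chain = [feat_channels, *cls_branch]
conv_pairs = list(zip(chain[:-1], chain[1:]))
print(conv_pairs)               # [(256, 128), (128, 64)] -> two 3x3 ConvModules
print((chain[-1], num_classes)) # (64, 10) -> the final 1x1 conv_cls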
- """ - cls_feat = x - reg_feat = x - - for cls_layer in self.cls_convs: - cls_feat = cls_layer(cls_feat) - # clone the cls_feat for reusing the feature map afterwards - clone_cls_feat = cls_feat.clone() - for conv_cls_prev_layer in self.conv_cls_prev: - clone_cls_feat = conv_cls_prev_layer(clone_cls_feat) - cls_score = self.conv_cls(clone_cls_feat) - - for reg_layer in self.reg_convs: - reg_feat = reg_layer(reg_feat) - bbox_pred = [] - for i in range(len(self.group_reg_dims)): - # clone the reg_feat for reusing the feature map afterwards - clone_reg_feat = reg_feat.clone() - if len(self.reg_branch[i]) > 0: - for conv_reg_prev_layer in self.conv_reg_prevs[i]: - clone_reg_feat = conv_reg_prev_layer(clone_reg_feat) - bbox_pred.append(self.conv_regs[i](clone_reg_feat)) - bbox_pred = torch.cat(bbox_pred, dim=1) - - dir_cls_pred = None - if self.use_direction_classifier: - clone_reg_feat = reg_feat.clone() - for conv_dir_cls_prev_layer in self.conv_dir_cls_prev: - clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat) - dir_cls_pred = self.conv_dir_cls(clone_reg_feat) - - attr_pred = None - if self.pred_attrs: - # clone the cls_feat for reusing the feature map afterwards - clone_cls_feat = cls_feat.clone() - for conv_attr_prev_layer in self.conv_attr_prev: - clone_cls_feat = conv_attr_prev_layer(clone_cls_feat) - attr_pred = self.conv_attr(clone_cls_feat) - - return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \ - reg_feat - - @abstractmethod - @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) - def loss(self, - cls_scores, - bbox_preds, - dir_cls_preds, - attr_preds, - gt_bboxes, - gt_labels, - gt_bboxes_3d, - gt_labels_3d, - centers2d, - depths, - attr_labels, - img_metas, - gt_bboxes_ignore=None): - """Compute loss of the head. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_classes. - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2) - attr_preds (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_attrs. - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): class indices corresponding to each box - gt_bboxes_3d (list[Tensor]): 3D Ground truth bboxes for each - image with shape (num_gts, bbox_code_size). - gt_labels_3d (list[Tensor]): 3D class indices of each box. - centers2d (list[Tensor]): Projected 3D centers onto 2D images. - depths (list[Tensor]): Depth of projected centers on 2D images. - attr_labels (list[Tensor], optional): Attribute indices - corresponding to each box - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes_ignore (list[Tensor]): specify which bounding - boxes can be ignored when computing the loss. - """ - - raise NotImplementedError - - @abstractmethod - @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) - def get_bboxes(self, - cls_scores, - bbox_preds, - dir_cls_preds, - attr_preds, - img_metas, - cfg=None, - rescale=None): - """Transform network output for a batch into bbox predictions. 
- - Args: - cls_scores (list[Tensor]): Box scores for each scale level - Has shape (N, num_points * num_classes, H, W) - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level with shape (N, num_points * bbox_code_size, H, W) - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2) - attr_preds (list[Tensor]): Attribute scores for each scale level - Has shape (N, num_points * num_attrs, H, W) - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - cfg (mmcv.Config): Test / postprocessing configuration, - if None, test_cfg would be used - rescale (bool): If True, return boxes in original image space - """ - - raise NotImplementedError - - @abstractmethod - def get_targets(self, points, gt_bboxes_list, gt_labels_list, - gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, - depths_list, attr_labels_list): - """Compute regression, classification and centerss targets for points - in multiple images. - - Args: - points (list[Tensor]): Points of each fpn level, each has shape - (num_points, 2). - gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, - each has shape (num_gt, 4). - gt_labels_list (list[Tensor]): Ground truth labels of each box, - each has shape (num_gt,). - gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each - image, each has shape (num_gt, bbox_code_size). - gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each - box, each has shape (num_gt,). - centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, - each has shape (num_gt, 2). - depths_list (list[Tensor]): Depth of projected 3D centers onto 2D - image, each has shape (num_gt, 1). - attr_labels_list (list[Tensor]): Attribute labels of each box, - each has shape (num_gt,). - """ - raise NotImplementedError - - def _get_points_single(self, - featmap_size, - stride, - dtype, - device, - flatten=False): - """Get points of a single scale level.""" - h, w = featmap_size - x_range = torch.arange(w, dtype=dtype, device=device) - y_range = torch.arange(h, dtype=dtype, device=device) - y, x = torch.meshgrid(y_range, x_range) - if flatten: - y = y.flatten() - x = x.flatten() - return y, x - - def get_points(self, featmap_sizes, dtype, device, flatten=False): - """Get points according to feature map sizes. - - Args: - featmap_sizes (list[tuple]): Multi-level feature map sizes. - dtype (torch.dtype): Type of points. - device (torch.device): Device of points. - - Returns: - tuple: points of each image. - """ - mlvl_points = [] - for i in range(len(featmap_sizes)): - mlvl_points.append( - self._get_points_single(featmap_sizes[i], self.strides[i], - dtype, device, flatten)) - return mlvl_points +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod + +import torch +from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init +from mmcv.runner import force_fp32 +from torch import nn as nn + +from mmdet.core import multi_apply +from ..builder import HEADS, build_loss +from .base_mono3d_dense_head import BaseMono3DDenseHead + + +@HEADS.register_module() +class AnchorFreeMono3DHead(BaseMono3DDenseHead): + """Anchor-free head for monocular 3D object detection. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int, optional): Number of hidden channels. + Used in child classes. 
Defaults to 256. + stacked_convs (int, optional): Number of stacking convs of the head. + strides (tuple, optional): Downsample factor of each feature map. + dcn_on_last_conv (bool, optional): If true, use dcn in the last + layer of towers. Default: False. + conv_bias (bool | str, optional): If specified as `auto`, it will be + decided by the norm_cfg. Bias of conv will be set as True + if `norm_cfg` is None, otherwise False. Default: 'auto'. + background_label (int, optional): Label ID of background, + set as 0 for RPN and num_classes for other heads. + It will automatically set as `num_classes` if None is given. + use_direction_classifier (bool, optional): + Whether to add a direction classifier. + diff_rad_by_sin (bool, optional): Whether to change the difference + into sin difference for box regression loss. Defaults to True. + dir_offset (float, optional): Parameter used in direction + classification. Defaults to 0. + dir_limit_offset (float, optional): Parameter used in direction + classification. Defaults to 0. + loss_cls (dict, optional): Config of classification loss. + loss_bbox (dict, optional): Config of localization loss. + loss_dir (dict, optional): Config of direction classifier loss. + loss_attr (dict, optional): Config of attribute classifier loss, + which is only active when `pred_attrs=True`. + bbox_code_size (int, optional): Dimensions of predicted bounding boxes. + pred_attrs (bool, optional): Whether to predict attributes. + Defaults to False. + num_attrs (int, optional): The number of attributes to be predicted. + Default: 9. + pred_velo (bool, optional): Whether to predict velocity. + Defaults to False. + pred_bbox2d (bool, optional): Whether to predict 2D boxes. + Defaults to False. + group_reg_dims (tuple[int], optional): The dimension of each regression + target group. Default: (2, 1, 3, 1, 2). + cls_branch (tuple[int], optional): Channels for classification branch. + Default: (128, 64). + reg_branch (tuple[tuple], optional): Channels for regression branch. + Default: ( + (128, 64), # offset + (128, 64), # depth + (64, ), # size + (64, ), # rot + () # velo + ), + dir_branch (tuple[int], optional): Channels for direction + classification branch. Default: (64, ). + attr_branch (tuple[int], optional): Channels for classification branch. + Default: (64, ). + conv_cfg (dict, optional): Config dict for convolution layer. + Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + train_cfg (dict, optional): Training config of anchor head. + test_cfg (dict, optional): Testing config of anchor head. 
+ """ # noqa: W605 + + _version = 1 + + def __init__( + self, + num_classes, + in_channels, + feat_channels=256, + stacked_convs=4, + strides=(4, 8, 16, 32, 64), + dcn_on_last_conv=False, + conv_bias='auto', + background_label=None, + use_direction_classifier=True, + diff_rad_by_sin=True, + dir_offset=0, + dir_limit_offset=0, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + bbox_code_size=9, # For nuscenes + pred_attrs=False, + num_attrs=9, # For nuscenes + pred_velo=False, + pred_bbox2d=False, + group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo, + cls_branch=(128, 64), + reg_branch=( + (128, 64), # offset + (128, 64), # depth + (64, ), # size + (64, ), # rot + () # velo + ), + dir_branch=(64, ), + attr_branch=(64, ), + conv_cfg=None, + norm_cfg=None, + train_cfg=None, + test_cfg=None, + init_cfg=None): + super(AnchorFreeMono3DHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.cls_out_channels = num_classes + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.use_direction_classifier = use_direction_classifier + self.diff_rad_by_sin = diff_rad_by_sin + self.dir_offset = dir_offset + self.dir_limit_offset = dir_limit_offset + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_dir = build_loss(loss_dir) + self.bbox_code_size = bbox_code_size + self.group_reg_dims = list(group_reg_dims) + self.cls_branch = cls_branch + self.reg_branch = reg_branch + assert len(reg_branch) == len(group_reg_dims), 'The number of '\ + 'element in reg_branch and group_reg_dims should be the same.' 
+ self.pred_velo = pred_velo + self.pred_bbox2d = pred_bbox2d + self.out_channels = [] + for reg_branch_channels in reg_branch: + if len(reg_branch_channels) > 0: + self.out_channels.append(reg_branch_channels[-1]) + else: + self.out_channels.append(-1) + self.dir_branch = dir_branch + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + self.background_label = ( + num_classes if background_label is None else background_label) + # background_label should be either 0 or num_classes + assert (self.background_label == 0 + or self.background_label == num_classes) + self.pred_attrs = pred_attrs + self.attr_background_label = -1 + self.num_attrs = num_attrs + if self.pred_attrs: + self.attr_background_label = num_attrs + self.loss_attr = build_loss(loss_attr) + self.attr_branch = attr_branch + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self._init_cls_convs() + self._init_reg_convs() + self._init_predictor() + + def _init_cls_convs(self): + """Initialize classification conv layers of the head.""" + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_reg_convs(self): + """Initialize bbox regression conv layers of the head.""" + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_branch(self, conv_channels=(64), conv_strides=(1)): + """Initialize conv layers as a prediction branch.""" + conv_before_pred = nn.ModuleList() + if isinstance(conv_channels, int): + conv_channels = [self.feat_channels] + [conv_channels] + conv_strides = [conv_strides] + else: + conv_channels = [self.feat_channels] + list(conv_channels) + conv_strides = list(conv_strides) + for i in range(len(conv_strides)): + conv_before_pred.append( + ConvModule( + conv_channels[i], + conv_channels[i + 1], + 3, + stride=conv_strides[i], + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + return conv_before_pred + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + self.conv_cls_prev = self._init_branch( + conv_channels=self.cls_branch, + conv_strides=(1, ) * len(self.cls_branch)) + self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels, + 1) + self.conv_reg_prevs = nn.ModuleList() + self.conv_regs = nn.ModuleList() + for i in range(len(self.group_reg_dims)): + reg_dim = self.group_reg_dims[i] + reg_branch_channels = self.reg_branch[i] + out_channel = self.out_channels[i] + if len(reg_branch_channels) > 0: + self.conv_reg_prevs.append( + self._init_branch( + conv_channels=reg_branch_channels, + conv_strides=(1, ) * len(reg_branch_channels))) + self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1)) + else: + self.conv_reg_prevs.append(None) + 
self.conv_regs.append( + nn.Conv2d(self.feat_channels, reg_dim, 1)) + if self.use_direction_classifier: + self.conv_dir_cls_prev = self._init_branch( + conv_channels=self.dir_branch, + conv_strides=(1, ) * len(self.dir_branch)) + self.conv_dir_cls = nn.Conv2d(self.dir_branch[-1], 2, 1) + if self.pred_attrs: + self.conv_attr_prev = self._init_branch( + conv_channels=self.attr_branch, + conv_strides=(1, ) * len(self.attr_branch)) + self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1) + + def init_weights(self): + """Initialize weights of the head. + + We currently still use the customized defined init_weights because the + default init of DCN triggered by the init_cfg will init + conv_offset.weight, which mistakenly affects the training stability. + """ + for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]: + for m in modules: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + for conv_reg_prev in self.conv_reg_prevs: + if conv_reg_prev is None: + continue + for m in conv_reg_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + if self.use_direction_classifier: + for m in self.conv_dir_cls_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + if self.pred_attrs: + for m in self.conv_attr_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + bias_cls = bias_init_with_prob(0.01) + normal_init(self.conv_cls, std=0.01, bias=bias_cls) + for conv_reg in self.conv_regs: + normal_init(conv_reg, std=0.01) + if self.use_direction_classifier: + normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls) + if self.pred_attrs: + normal_init(self.conv_attr, std=0.01, bias=bias_cls) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually contain classification scores, bbox predictions, + and direction class predictions. + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + attr_preds (list[Tensor]): Attribute scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_attrs. + """ + return multi_apply(self.forward_single, feats)[:5] + + def forward_single(self, x): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + + Returns: + tuple: Scores for each class, bbox predictions, direction class, + and attributes, features after classification and regression + conv layers, some models needs these features like FCOS. 
+ """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + # clone the cls_feat for reusing the feature map afterwards + clone_cls_feat = cls_feat.clone() + for conv_cls_prev_layer in self.conv_cls_prev: + clone_cls_feat = conv_cls_prev_layer(clone_cls_feat) + cls_score = self.conv_cls(clone_cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = [] + for i in range(len(self.group_reg_dims)): + # clone the reg_feat for reusing the feature map afterwards + clone_reg_feat = reg_feat.clone() + if len(self.reg_branch[i]) > 0: + for conv_reg_prev_layer in self.conv_reg_prevs[i]: + clone_reg_feat = conv_reg_prev_layer(clone_reg_feat) + bbox_pred.append(self.conv_regs[i](clone_reg_feat)) + bbox_pred = torch.cat(bbox_pred, dim=1) + + dir_cls_pred = None + if self.use_direction_classifier: + clone_reg_feat = reg_feat.clone() + for conv_dir_cls_prev_layer in self.conv_dir_cls_prev: + clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat) + dir_cls_pred = self.conv_dir_cls(clone_reg_feat) + + attr_pred = None + if self.pred_attrs: + # clone the cls_feat for reusing the feature map afterwards + clone_cls_feat = cls_feat.clone() + for conv_attr_prev_layer in self.conv_attr_prev: + clone_cls_feat = conv_attr_prev_layer(clone_cls_feat) + attr_pred = self.conv_attr(clone_cls_feat) + + return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \ + reg_feat + + @abstractmethod + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + attr_preds (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_attrs. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_3d (list[Tensor]): 3D Ground truth bboxes for each + image with shape (num_gts, bbox_code_size). + gt_labels_3d (list[Tensor]): 3D class indices of each box. + centers2d (list[Tensor]): Projected 3D centers onto 2D images. + depths (list[Tensor]): Depth of projected centers on 2D images. + attr_labels (list[Tensor], optional): Attribute indices + corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + """ + + raise NotImplementedError + + @abstractmethod + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + img_metas, + cfg=None, + rescale=None): + """Transform network output for a batch into bbox predictions. 
+ + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * bbox_code_size, H, W) + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + attr_preds (list[Tensor]): Attribute scores for each scale level + Has shape (N, num_points * num_attrs, H, W) + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space + """ + + raise NotImplementedError + + @abstractmethod + def get_targets(self, points, gt_bboxes_list, gt_labels_list, + gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, + depths_list, attr_labels_list): + """Compute regression, classification and centerss targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each + image, each has shape (num_gt, bbox_code_size). + gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each + box, each has shape (num_gt,). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + each has shape (num_gt, 2). + depths_list (list[Tensor]): Depth of projected 3D centers onto 2D + image, each has shape (num_gt, 1). + attr_labels_list (list[Tensor]): Attribute labels of each box, + each has shape (num_gt,). + """ + raise NotImplementedError + + def _get_points_single(self, + featmap_size, + stride, + dtype, + device, + flatten=False): + """Get points of a single scale level.""" + h, w = featmap_size + x_range = torch.arange(w, dtype=dtype, device=device) + y_range = torch.arange(h, dtype=dtype, device=device) + y, x = torch.meshgrid(y_range, x_range) + if flatten: + y = y.flatten() + x = x.flatten() + return y, x + + def get_points(self, featmap_sizes, dtype, device, flatten=False): + """Get points according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + dtype (torch.dtype): Type of points. + device (torch.device): Device of points. + + Returns: + tuple: points of each image. + """ + mlvl_points = [] + for i in range(len(featmap_sizes)): + mlvl_points.append( + self._get_points_single(featmap_sizes[i], self.strides[i], + dtype, device, flatten)) + return mlvl_points diff --git a/mmdet3d/models/dense_heads/base_conv_bbox_head.py b/mmdet3d/models/dense_heads/base_conv_bbox_head.py index ec5eaa6..439dd7a 100644 --- a/mmdet3d/models/dense_heads/base_conv_bbox_head.py +++ b/mmdet3d/models/dense_heads/base_conv_bbox_head.py @@ -1,131 +1,131 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import ConvModule -from mmcv.cnn.bricks import build_conv_layer -from mmcv.runner import BaseModule -from torch import nn as nn - -from ..builder import HEADS - - -@HEADS.register_module() -class BaseConvBboxHead(BaseModule): - r"""More general bbox head, with shared conv layers and two optional - separated branches. - - .. 
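For reference, a minimal standalone sketch of how `forward_single` above assembles `bbox_pred`: with the default `group_reg_dims=(2, 1, 3, 1, 2)` (offset, depth, size, rot, velo), the per-group predictions concatenate to the 9 channels expected by `bbox_code_size=9`. The sketch assumes only torch; the 1x1 convs below are stand-ins, not the branches actually built by `_init_predictor`.

import torch
from torch import nn

group_reg_dims = (2, 1, 3, 1, 2)  # offset, depth, size, rot, velo
feat_channels = 256
# stand-in 1x1 convs, one per regression group (the real head may insert
# extra conv_reg_prevs layers before each of these)
conv_regs = nn.ModuleList(
    [nn.Conv2d(feat_channels, dim, 1) for dim in group_reg_dims])

reg_feat = torch.randn(2, feat_channels, 32, 32)  # (N, C, H, W) FPN feature
bbox_pred = torch.cat([conv(reg_feat.clone()) for conv in conv_regs], dim=1)
assert bbox_pred.size(1) == sum(group_reg_dims)  # 9 == bbox_code_size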
code-block:: none - - /-> cls convs -> cls_score - shared convs - \-> reg convs -> bbox_pred - """ - - def __init__(self, - in_channels=0, - shared_conv_channels=(), - cls_conv_channels=(), - num_cls_out_channels=0, - reg_conv_channels=(), - num_reg_out_channels=0, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - bias='auto', - init_cfg=None, - *args, - **kwargs): - super(BaseConvBboxHead, self).__init__( - init_cfg=init_cfg, *args, **kwargs) - assert in_channels > 0 - assert num_cls_out_channels > 0 - assert num_reg_out_channels > 0 - self.in_channels = in_channels - self.shared_conv_channels = shared_conv_channels - self.cls_conv_channels = cls_conv_channels - self.num_cls_out_channels = num_cls_out_channels - self.reg_conv_channels = reg_conv_channels - self.num_reg_out_channels = num_reg_out_channels - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.bias = bias - - # add shared convs - if len(self.shared_conv_channels) > 0: - self.shared_convs = self._add_conv_branch( - self.in_channels, self.shared_conv_channels) - out_channels = self.shared_conv_channels[-1] - else: - out_channels = self.in_channels - - # add cls specific branch - prev_channel = out_channels - if len(self.cls_conv_channels) > 0: - self.cls_convs = self._add_conv_branch(prev_channel, - self.cls_conv_channels) - prev_channel = self.cls_conv_channels[-1] - - self.conv_cls = build_conv_layer( - conv_cfg, - in_channels=prev_channel, - out_channels=num_cls_out_channels, - kernel_size=1) - # add reg specific branch - prev_channel = out_channels - if len(self.reg_conv_channels) > 0: - self.reg_convs = self._add_conv_branch(prev_channel, - self.reg_conv_channels) - prev_channel = self.reg_conv_channels[-1] - - self.conv_reg = build_conv_layer( - conv_cfg, - in_channels=prev_channel, - out_channels=num_reg_out_channels, - kernel_size=1) - - def _add_conv_branch(self, in_channels, conv_channels): - """Add shared or separable branch.""" - conv_spec = [in_channels] + list(conv_channels) - # add branch specific conv layers - conv_layers = nn.Sequential() - for i in range(len(conv_spec) - 1): - conv_layers.add_module( - f'layer{i}', - ConvModule( - conv_spec[i], - conv_spec[i + 1], - kernel_size=1, - padding=0, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - bias=self.bias, - inplace=True)) - return conv_layers - - def forward(self, feats): - """Forward. - - Args: - feats (Tensor): Input features - - Returns: - Tensor: Class scores predictions - Tensor: Regression predictions - """ - # shared part - if len(self.shared_conv_channels) > 0: - x = self.shared_convs(feats) - - # separate branches - x_cls = x - x_reg = x - - if len(self.cls_conv_channels) > 0: - x_cls = self.cls_convs(x_cls) - cls_score = self.conv_cls(x_cls) - - if len(self.reg_conv_channels) > 0: - x_reg = self.reg_convs(x_reg) - bbox_pred = self.conv_reg(x_reg) - - return cls_score, bbox_pred +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule +from mmcv.cnn.bricks import build_conv_layer +from mmcv.runner import BaseModule +from torch import nn as nn + +from ..builder import HEADS + + +@HEADS.register_module() +class BaseConvBboxHead(BaseModule): + r"""More general bbox head, with shared conv layers and two optional + separated branches. + + .. 
code-block:: none + + /-> cls convs -> cls_score + shared convs + \-> reg convs -> bbox_pred + """ + + def __init__(self, + in_channels=0, + shared_conv_channels=(), + cls_conv_channels=(), + num_cls_out_channels=0, + reg_conv_channels=(), + num_reg_out_channels=0, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + bias='auto', + init_cfg=None, + *args, + **kwargs): + super(BaseConvBboxHead, self).__init__( + init_cfg=init_cfg, *args, **kwargs) + assert in_channels > 0 + assert num_cls_out_channels > 0 + assert num_reg_out_channels > 0 + self.in_channels = in_channels + self.shared_conv_channels = shared_conv_channels + self.cls_conv_channels = cls_conv_channels + self.num_cls_out_channels = num_cls_out_channels + self.reg_conv_channels = reg_conv_channels + self.num_reg_out_channels = num_reg_out_channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.bias = bias + + # add shared convs + if len(self.shared_conv_channels) > 0: + self.shared_convs = self._add_conv_branch( + self.in_channels, self.shared_conv_channels) + out_channels = self.shared_conv_channels[-1] + else: + out_channels = self.in_channels + + # add cls specific branch + prev_channel = out_channels + if len(self.cls_conv_channels) > 0: + self.cls_convs = self._add_conv_branch(prev_channel, + self.cls_conv_channels) + prev_channel = self.cls_conv_channels[-1] + + self.conv_cls = build_conv_layer( + conv_cfg, + in_channels=prev_channel, + out_channels=num_cls_out_channels, + kernel_size=1) + # add reg specific branch + prev_channel = out_channels + if len(self.reg_conv_channels) > 0: + self.reg_convs = self._add_conv_branch(prev_channel, + self.reg_conv_channels) + prev_channel = self.reg_conv_channels[-1] + + self.conv_reg = build_conv_layer( + conv_cfg, + in_channels=prev_channel, + out_channels=num_reg_out_channels, + kernel_size=1) + + def _add_conv_branch(self, in_channels, conv_channels): + """Add shared or separable branch.""" + conv_spec = [in_channels] + list(conv_channels) + # add branch specific conv layers + conv_layers = nn.Sequential() + for i in range(len(conv_spec) - 1): + conv_layers.add_module( + f'layer{i}', + ConvModule( + conv_spec[i], + conv_spec[i + 1], + kernel_size=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.bias, + inplace=True)) + return conv_layers + + def forward(self, feats): + """Forward. + + Args: + feats (Tensor): Input features + + Returns: + Tensor: Class scores predictions + Tensor: Regression predictions + """ + # shared part + if len(self.shared_conv_channels) > 0: + x = self.shared_convs(feats) + + # separate branches + x_cls = x + x_reg = x + + if len(self.cls_conv_channels) > 0: + x_cls = self.cls_convs(x_cls) + cls_score = self.conv_cls(x_cls) + + if len(self.reg_conv_channels) > 0: + x_reg = self.reg_convs(x_reg) + bbox_pred = self.conv_reg(x_reg) + + return cls_score, bbox_pred diff --git a/mmdet3d/models/dense_heads/base_mono3d_dense_head.py b/mmdet3d/models/dense_heads/base_mono3d_dense_head.py index 2444473..bac424b 100644 --- a/mmdet3d/models/dense_heads/base_mono3d_dense_head.py +++ b/mmdet3d/models/dense_heads/base_mono3d_dense_head.py @@ -1,78 +1,78 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
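For reference, a minimal usage sketch of `BaseConvBboxHead` as defined above, assuming torch and this mmdet3d module are importable; the channel numbers are illustrative only.

import torch
from mmdet3d.models.dense_heads.base_conv_bbox_head import BaseConvBboxHead

head = BaseConvBboxHead(
    in_channels=128,
    shared_conv_channels=(128, 128),
    cls_conv_channels=(128, ),
    num_cls_out_channels=18,   # illustrative: per-proposal class scores
    reg_conv_channels=(128, ),
    num_reg_out_channels=77)   # illustrative: encoded box parameters

# Conv1d/BN1d by default, so features are (B, C, num_proposals)
feats = torch.rand(2, 128, 256)
cls_score, bbox_pred = head(feats)
# cls_score: (2, 18, 256), bbox_pred: (2, 77, 256)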
-from abc import ABCMeta, abstractmethod - -from mmcv.runner import BaseModule - - -class BaseMono3DDenseHead(BaseModule, metaclass=ABCMeta): - """Base class for Monocular 3D DenseHeads.""" - - def __init__(self, init_cfg=None): - super(BaseMono3DDenseHead, self).__init__(init_cfg=init_cfg) - - @abstractmethod - def loss(self, **kwargs): - """Compute losses of the head.""" - pass - - @abstractmethod - def get_bboxes(self, **kwargs): - """Transform network output for a batch into bbox predictions.""" - pass - - def forward_train(self, - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_bboxes_3d=None, - gt_labels_3d=None, - centers2d=None, - depths=None, - attr_labels=None, - gt_bboxes_ignore=None, - proposal_cfg=None, - **kwargs): - """ - Args: - x (list[Tensor]): Features from FPN. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes (list[Tensor]): Ground truth bboxes of the image, - shape (num_gts, 4). - gt_labels (list[Tensor]): Ground truth labels of each box, - shape (num_gts,). - gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image, - shape (num_gts, self.bbox_code_size). - gt_labels_3d (list[Tensor]): 3D ground truth labels of each box, - shape (num_gts,). - centers2d (list[Tensor]): Projected 3D center of each box, - shape (num_gts, 2). - depths (list[Tensor]): Depth of projected 3D center of each box, - shape (num_gts,). - attr_labels (list[Tensor]): Attribute labels of each box, - shape (num_gts,). - gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be - ignored, shape (num_ignored_gts, 4). - proposal_cfg (mmcv.Config): Test / postprocessing configuration, - if None, test_cfg would be used - - Returns: - tuple: - losses: (dict[str, Tensor]): A dictionary of loss components. - proposal_list (list[Tensor]): Proposals of each image. - """ - outs = self(x) - if gt_labels is None: - loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths, - attr_labels, img_metas) - else: - loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, depths, attr_labels, - img_metas) - losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - if proposal_cfg is None: - return losses - else: - proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg) - return losses, proposal_list +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +from mmcv.runner import BaseModule + + +class BaseMono3DDenseHead(BaseModule, metaclass=ABCMeta): + """Base class for Monocular 3D DenseHeads.""" + + def __init__(self, init_cfg=None): + super(BaseMono3DDenseHead, self).__init__(init_cfg=init_cfg) + + @abstractmethod + def loss(self, **kwargs): + """Compute losses of the head.""" + pass + + @abstractmethod + def get_bboxes(self, **kwargs): + """Transform network output for a batch into bbox predictions.""" + pass + + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + centers2d=None, + depths=None, + attr_labels=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + """ + Args: + x (list[Tensor]): Features from FPN. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (list[Tensor]): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, + shape (num_gts,). 
+ gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image, + shape (num_gts, self.bbox_code_size). + gt_labels_3d (list[Tensor]): 3D ground truth labels of each box, + shape (num_gts,). + centers2d (list[Tensor]): Projected 3D center of each box, + shape (num_gts, 2). + depths (list[Tensor]): Depth of projected 3D center of each box, + shape (num_gts,). + attr_labels (list[Tensor]): Attribute labels of each box, + shape (num_gts,). + gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. + """ + outs = self(x) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths, + attr_labels, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, + img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg) + return losses, proposal_list diff --git a/mmdet3d/models/dense_heads/centerpoint_head.py b/mmdet3d/models/dense_heads/centerpoint_head.py index 2cf758b..3b036b6 100644 --- a/mmdet3d/models/dense_heads/centerpoint_head.py +++ b/mmdet3d/models/dense_heads/centerpoint_head.py @@ -1,830 +1,830 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch -from mmcv.cnn import ConvModule, build_conv_layer -from mmcv.runner import BaseModule, force_fp32 -from torch import nn - -from mmdet3d.core import (circle_nms, draw_heatmap_gaussian, gaussian_radius, - xywhr2xyxyr) -from mmdet3d.core.post_processing import nms_bev -from mmdet3d.models import builder -from mmdet3d.models.utils import clip_sigmoid -from mmdet.core import build_bbox_coder, multi_apply -from ..builder import HEADS, build_loss - - -@HEADS.register_module() -class SeparateHead(BaseModule): - """SeparateHead for CenterHead. - - Args: - in_channels (int): Input channels for conv_layer. - heads (dict): Conv information. - head_conv (int, optional): Output channels. - Default: 64. - final_kernel (int, optional): Kernel size for the last conv layer. - Default: 1. - init_bias (float, optional): Initial bias. Default: -2.19. - conv_cfg (dict, optional): Config of conv layer. - Default: dict(type='Conv2d') - norm_cfg (dict, optional): Config of norm layer. - Default: dict(type='BN2d'). - bias (str, optional): Type of bias. Default: 'auto'. 
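To make the `forward_train` contract above concrete, here is a hypothetical minimal subclass (the name `ToyMono3DHead` and its dummy outputs are illustrative, not part of mmdet3d): `forward()` must return a tuple of output groups, `loss()` receives those groups followed by the ground-truth lists, and `get_bboxes()` is only called when `proposal_cfg` is given.

from mmdet3d.models.dense_heads.base_mono3d_dense_head import \
    BaseMono3DDenseHead


class ToyMono3DHead(BaseMono3DDenseHead):
    """Hypothetical head used only to illustrate the forward_train contract."""

    def forward(self, feats):
        # pretend predictions: a single output group, one entry per FPN level
        return (list(feats), )

    def loss(self, outs, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d,
             centers2d, depths, attr_labels, img_metas,
             gt_bboxes_ignore=None):
        # forward_train unpacks: outs + (gt_bboxes, gt_labels, ..., img_metas)
        return dict(loss_toy=sum(o.sum() * 0 for o in outs))

    def get_bboxes(self, outs, img_metas, cfg=None, rescale=None):
        # one (empty) proposal list per image
        return [[] for _ in img_metas]

With such a head, `forward_train(x, img_metas, gt_bboxes, gt_labels, ...)` returns only `losses`, or `(losses, proposal_list)` when `proposal_cfg` is passed.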
- """ - - def __init__(self, - in_channels, - heads, - head_conv=64, - final_kernel=1, - init_bias=-2.19, - conv_cfg=dict(type='Conv2d'), - norm_cfg=dict(type='BN2d'), - bias='auto', - init_cfg=None, - **kwargs): - assert init_cfg is None, 'To prevent abnormal initialization ' \ - 'behavior, init_cfg is not allowed to be set' - super(SeparateHead, self).__init__(init_cfg=init_cfg) - self.heads = heads - self.init_bias = init_bias - for head in self.heads: - classes, num_conv = self.heads[head] - - conv_layers = [] - c_in = in_channels - for i in range(num_conv - 1): - conv_layers.append( - ConvModule( - c_in, - head_conv, - kernel_size=final_kernel, - stride=1, - padding=final_kernel // 2, - bias=bias, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg)) - c_in = head_conv - - conv_layers.append( - build_conv_layer( - conv_cfg, - head_conv, - classes, - kernel_size=final_kernel, - stride=1, - padding=final_kernel // 2, - bias=True)) - conv_layers = nn.Sequential(*conv_layers) - - self.__setattr__(head, conv_layers) - - if init_cfg is None: - self.init_cfg = dict(type='Kaiming', layer='Conv2d') - - def init_weights(self): - """Initialize weights.""" - super().init_weights() - for head in self.heads: - if head == 'heatmap': - self.__getattr__(head)[-1].bias.data.fill_(self.init_bias) - - def forward(self, x): - """Forward function for SepHead. - - Args: - x (torch.Tensor): Input feature map with the shape of - [B, 512, 128, 128]. - - Returns: - dict[str: torch.Tensor]: contains the following keys: - - -reg (torch.Tensor): 2D regression value with the - shape of [B, 2, H, W]. - -height (torch.Tensor): Height value with the - shape of [B, 1, H, W]. - -dim (torch.Tensor): Size value with the shape - of [B, 3, H, W]. - -rot (torch.Tensor): Rotation value with the - shape of [B, 2, H, W]. - -vel (torch.Tensor): Velocity value with the - shape of [B, 2, H, W]. - -heatmap (torch.Tensor): Heatmap with the shape of - [B, N, H, W]. - """ - ret_dict = dict() - for head in self.heads: - ret_dict[head] = self.__getattr__(head)(x) - - return ret_dict - - -@HEADS.register_module() -class DCNSeparateHead(BaseModule): - r"""DCNSeparateHead for CenterHead. - - .. code-block:: none - /-----> DCN for heatmap task -----> heatmap task. - feature - \-----> DCN for regression tasks -----> regression tasks - - Args: - in_channels (int): Input channels for conv_layer. - num_cls (int): Number of classes. - heads (dict): Conv information. - dcn_config (dict): Config of dcn layer. - head_conv (int, optional): Output channels. - Default: 64. - final_kernel (int, optional): Kernel size for the last conv - layer. Default: 1. - init_bias (float, optional): Initial bias. Default: -2.19. - conv_cfg (dict, optional): Config of conv layer. - Default: dict(type='Conv2d') - norm_cfg (dict, optional): Config of norm layer. - Default: dict(type='BN2d'). - bias (str, optional): Type of bias. Default: 'auto'. 
- """ # noqa: W605 - - def __init__(self, - in_channels, - num_cls, - heads, - dcn_config, - head_conv=64, - final_kernel=1, - init_bias=-2.19, - conv_cfg=dict(type='Conv2d'), - norm_cfg=dict(type='BN2d'), - bias='auto', - init_cfg=None, - **kwargs): - assert init_cfg is None, 'To prevent abnormal initialization ' \ - 'behavior, init_cfg is not allowed to be set' - super(DCNSeparateHead, self).__init__(init_cfg=init_cfg) - if 'heatmap' in heads: - heads.pop('heatmap') - # feature adaptation with dcn - # use separate features for classification / regression - self.feature_adapt_cls = build_conv_layer(dcn_config) - - self.feature_adapt_reg = build_conv_layer(dcn_config) - - # heatmap prediction head - cls_head = [ - ConvModule( - in_channels, - head_conv, - kernel_size=3, - padding=1, - conv_cfg=conv_cfg, - bias=bias, - norm_cfg=norm_cfg), - build_conv_layer( - conv_cfg, - head_conv, - num_cls, - kernel_size=3, - stride=1, - padding=1, - bias=bias) - ] - self.cls_head = nn.Sequential(*cls_head) - self.init_bias = init_bias - # other regression target - self.task_head = SeparateHead( - in_channels, - heads, - head_conv=head_conv, - final_kernel=final_kernel, - bias=bias) - if init_cfg is None: - self.init_cfg = dict(type='Kaiming', layer='Conv2d') - - def init_weights(self): - """Initialize weights.""" - super().init_weights() - self.cls_head[-1].bias.data.fill_(self.init_bias) - - def forward(self, x): - """Forward function for DCNSepHead. - - Args: - x (torch.Tensor): Input feature map with the shape of - [B, 512, 128, 128]. - - Returns: - dict[str: torch.Tensor]: contains the following keys: - - -reg (torch.Tensor): 2D regression value with the - shape of [B, 2, H, W]. - -height (torch.Tensor): Height value with the - shape of [B, 1, H, W]. - -dim (torch.Tensor): Size value with the shape - of [B, 3, H, W]. - -rot (torch.Tensor): Rotation value with the - shape of [B, 2, H, W]. - -vel (torch.Tensor): Velocity value with the - shape of [B, 2, H, W]. - -heatmap (torch.Tensor): Heatmap with the shape of - [B, N, H, W]. - """ - center_feat = self.feature_adapt_cls(x) - reg_feat = self.feature_adapt_reg(x) - - cls_score = self.cls_head(center_feat) - ret = self.task_head(reg_feat) - ret['heatmap'] = cls_score - - return ret - - -@HEADS.register_module() -class CenterHead(BaseModule): - """CenterHead for CenterPoint. - - Args: - in_channels (list[int] | int, optional): Channels of the input - feature map. Default: [128]. - tasks (list[dict], optional): Task information including class number - and class names. Default: None. - train_cfg (dict, optional): Train-time configs. Default: None. - test_cfg (dict, optional): Test-time configs. Default: None. - bbox_coder (dict, optional): Bbox coder configs. Default: None. - common_heads (dict, optional): Conv information for common heads. - Default: dict(). - loss_cls (dict, optional): Config of classification loss function. - Default: dict(type='GaussianFocalLoss', reduction='mean'). - loss_bbox (dict, optional): Config of regression loss function. - Default: dict(type='L1Loss', reduction='none'). - separate_head (dict, optional): Config of separate head. Default: dict( - type='SeparateHead', init_bias=-2.19, final_kernel=3) - share_conv_channel (int, optional): Output channels for share_conv - layer. Default: 64. - num_heatmap_convs (int, optional): Number of conv layers for heatmap - conv layer. Default: 2. - conv_cfg (dict, optional): Config of conv layer. - Default: dict(type='Conv2d') - norm_cfg (dict, optional): Config of norm layer. 
- Default: dict(type='BN2d'). - bias (str, optional): Type of bias. Default: 'auto'. - """ - - def __init__(self, - in_channels=[128], - tasks=None, - train_cfg=None, - test_cfg=None, - bbox_coder=None, - common_heads=dict(), - loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), - loss_bbox=dict( - type='L1Loss', reduction='none', loss_weight=0.25), - separate_head=dict( - type='SeparateHead', init_bias=-2.19, final_kernel=3), - share_conv_channel=64, - num_heatmap_convs=2, - conv_cfg=dict(type='Conv2d'), - norm_cfg=dict(type='BN2d'), - bias='auto', - norm_bbox=True, - init_cfg=None): - assert init_cfg is None, 'To prevent abnormal initialization ' \ - 'behavior, init_cfg is not allowed to be set' - super(CenterHead, self).__init__(init_cfg=init_cfg) - - num_classes = [len(t['class_names']) for t in tasks] - self.class_names = [t['class_names'] for t in tasks] - self.train_cfg = train_cfg - self.test_cfg = test_cfg - self.in_channels = in_channels - self.num_classes = num_classes - self.norm_bbox = norm_bbox - - self.loss_cls = build_loss(loss_cls) - self.loss_bbox = build_loss(loss_bbox) - self.bbox_coder = build_bbox_coder(bbox_coder) - self.num_anchor_per_locs = [n for n in num_classes] - self.fp16_enabled = False - - # a shared convolution - self.shared_conv = ConvModule( - in_channels, - share_conv_channel, - kernel_size=3, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=bias) - - self.task_heads = nn.ModuleList() - - for num_cls in num_classes: - heads = copy.deepcopy(common_heads) - heads.update(dict(heatmap=(num_cls, num_heatmap_convs))) - separate_head.update( - in_channels=share_conv_channel, heads=heads, num_cls=num_cls) - self.task_heads.append(builder.build_head(separate_head)) - - def forward_single(self, x): - """Forward function for CenterPoint. - - Args: - x (torch.Tensor): Input feature map with the shape of - [B, 512, 128, 128]. - - Returns: - list[dict]: Output results for tasks. - """ - ret_dicts = [] - - x = self.shared_conv(x) - - for task in self.task_heads: - ret_dicts.append(task(x)) - - return ret_dicts - - def forward(self, feats): - """Forward pass. - - Args: - feats (list[torch.Tensor]): Multi-level features, e.g., - features produced by FPN. - - Returns: - tuple(list[dict]): Output results for tasks. - """ - return multi_apply(self.forward_single, feats) - - def _gather_feat(self, feat, ind, mask=None): - """Gather feature map. - - Given feature map and index, return indexed feature map. - - Args: - feat (torch.tensor): Feature map with the shape of [B, H*W, 10]. - ind (torch.Tensor): Index of the ground truth boxes with the - shape of [B, max_obj]. - mask (torch.Tensor, optional): Mask of the feature map with the - shape of [B, max_obj]. Default: None. - - Returns: - torch.Tensor: Feature map after gathering with the shape - of [B, max_obj, 10]. - """ - dim = feat.size(2) - ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) - feat = feat.gather(1, ind) - if mask is not None: - mask = mask.unsqueeze(2).expand_as(feat) - feat = feat[mask] - feat = feat.view(-1, dim) - return feat - - def get_targets(self, gt_bboxes_3d, gt_labels_3d): - """Generate targets. - - How each output is transformed: - - Each nested list is transposed so that all same-index elements in - each sub-list (1, ..., N) become the new sub-lists. - [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ] - ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... 
] ] - - The new transposed nested list is converted into a list of N - tensors generated by concatenating tensors in the new sub-lists. - [ tensor0, tensor1, tensor2, ... ] - - Args: - gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground - truth gt boxes. - gt_labels_3d (list[torch.Tensor]): Labels of boxes. - - Returns: - Returns: - tuple[list[torch.Tensor]]: Tuple of target including - the following results in order. - - - list[torch.Tensor]: Heatmap scores. - - list[torch.Tensor]: Ground truth boxes. - - list[torch.Tensor]: Indexes indicating the - position of the valid boxes. - - list[torch.Tensor]: Masks indicating which - boxes are valid. - """ - heatmaps, anno_boxes, inds, masks = multi_apply( - self.get_targets_single, gt_bboxes_3d, gt_labels_3d) - # Transpose heatmaps - heatmaps = list(map(list, zip(*heatmaps))) - heatmaps = [torch.stack(hms_) for hms_ in heatmaps] - # Transpose anno_boxes - anno_boxes = list(map(list, zip(*anno_boxes))) - anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes] - # Transpose inds - inds = list(map(list, zip(*inds))) - inds = [torch.stack(inds_) for inds_ in inds] - # Transpose inds - masks = list(map(list, zip(*masks))) - masks = [torch.stack(masks_) for masks_ in masks] - return heatmaps, anno_boxes, inds, masks - - def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): - """Generate training targets for a single sample. - - Args: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. - gt_labels_3d (torch.Tensor): Labels of boxes. - - Returns: - tuple[list[torch.Tensor]]: Tuple of target including - the following results in order. - - - list[torch.Tensor]: Heatmap scores. - - list[torch.Tensor]: Ground truth boxes. - - list[torch.Tensor]: Indexes indicating the position - of the valid boxes. - - list[torch.Tensor]: Masks indicating which boxes - are valid. - """ - device = gt_labels_3d.device - gt_bboxes_3d = torch.cat( - (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), - dim=1).to(device) - max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] - grid_size = torch.tensor(self.train_cfg['grid_size']) - pc_range = torch.tensor(self.train_cfg['point_cloud_range']) - voxel_size = torch.tensor(self.train_cfg['voxel_size']) - - feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] - - # reorganize the gt_dict by tasks - task_masks = [] - flag = 0 - for class_name in self.class_names: - task_masks.append([ - torch.where(gt_labels_3d == class_name.index(i) + flag) - for i in class_name - ]) - flag += len(class_name) - - task_boxes = [] - task_classes = [] - flag2 = 0 - for idx, mask in enumerate(task_masks): - task_box = [] - task_class = [] - for m in mask: - task_box.append(gt_bboxes_3d[m]) - # 0 is background for each task, so we need to add 1 here. 
- task_class.append(gt_labels_3d[m] + 1 - flag2) - task_boxes.append(torch.cat(task_box, axis=0).to(device)) - task_classes.append(torch.cat(task_class).long().to(device)) - flag2 += len(mask) - draw_gaussian = draw_heatmap_gaussian - heatmaps, anno_boxes, inds, masks = [], [], [], [] - - for idx, task_head in enumerate(self.task_heads): - heatmap = gt_bboxes_3d.new_zeros( - (len(self.class_names[idx]), feature_map_size[1], - feature_map_size[0])) - - anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), - dtype=torch.float32) - - ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64) - mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8) - - num_objs = min(task_boxes[idx].shape[0], max_objs) - - for k in range(num_objs): - cls_id = task_classes[idx][k] - 1 - - width = task_boxes[idx][k][3] - length = task_boxes[idx][k][4] - width = width / voxel_size[0] / self.train_cfg[ - 'out_size_factor'] - length = length / voxel_size[1] / self.train_cfg[ - 'out_size_factor'] - - if width > 0 and length > 0: - radius = gaussian_radius( - (length, width), - min_overlap=self.train_cfg['gaussian_overlap']) - radius = max(self.train_cfg['min_radius'], int(radius)) - - # be really careful for the coordinate system of - # your box annotation. - x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ - 1], task_boxes[idx][k][2] - - coor_x = ( - x - pc_range[0] - ) / voxel_size[0] / self.train_cfg['out_size_factor'] - coor_y = ( - y - pc_range[1] - ) / voxel_size[1] / self.train_cfg['out_size_factor'] - - center = torch.tensor([coor_x, coor_y], - dtype=torch.float32, - device=device) - center_int = center.to(torch.int32) - - # throw out not in range objects to avoid out of array - # area when creating the heatmap - if not (0 <= center_int[0] < feature_map_size[0] - and 0 <= center_int[1] < feature_map_size[1]): - continue - - draw_gaussian(heatmap[cls_id], center_int, radius) - - new_idx = k - x, y = center_int[0], center_int[1] - - assert (y * feature_map_size[0] + x < - feature_map_size[0] * feature_map_size[1]) - - ind[new_idx] = y * feature_map_size[0] + x - mask[new_idx] = 1 - # TODO: support other outdoor dataset - vx, vy = task_boxes[idx][k][7:] - rot = task_boxes[idx][k][6] - box_dim = task_boxes[idx][k][3:6] - if self.norm_bbox: - box_dim = box_dim.log() - anno_box[new_idx] = torch.cat([ - center - torch.tensor([x, y], device=device), - z.unsqueeze(0), box_dim, - torch.sin(rot).unsqueeze(0), - torch.cos(rot).unsqueeze(0), - vx.unsqueeze(0), - vy.unsqueeze(0) - ]) - - heatmaps.append(heatmap) - anno_boxes.append(anno_box) - masks.append(mask) - inds.append(ind) - return heatmaps, anno_boxes, inds, masks - - @force_fp32(apply_to=('preds_dicts')) - def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs): - """Loss function for CenterHead. - - Args: - gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground - truth gt boxes. - gt_labels_3d (list[torch.Tensor]): Labels of boxes. - preds_dicts (dict): Output of forward function. - - Returns: - dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 
- """ - heatmaps, anno_boxes, inds, masks = self.get_targets( - gt_bboxes_3d, gt_labels_3d) - loss_dict = dict() - for task_id, preds_dict in enumerate(preds_dicts): - # heatmap focal loss - preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) - num_pos = heatmaps[task_id].eq(1).float().sum().item() - loss_heatmap = self.loss_cls( - preds_dict[0]['heatmap'], - heatmaps[task_id], - avg_factor=max(num_pos, 1)) - target_box = anno_boxes[task_id] - # reconstruct the anno_box from multiple reg heads - preds_dict[0]['anno_box'] = torch.cat( - (preds_dict[0]['reg'], preds_dict[0]['height'], - preds_dict[0]['dim'], preds_dict[0]['rot'], - preds_dict[0]['vel']), - dim=1) - - # Regression loss for dimension, offset, height, rotation - ind = inds[task_id] - num = masks[task_id].float().sum() - pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() - pred = pred.view(pred.size(0), -1, pred.size(3)) - pred = self._gather_feat(pred, ind) - mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() - isnotnan = (~torch.isnan(target_box)).float() - mask *= isnotnan - - code_weights = self.train_cfg.get('code_weights', None) - bbox_weights = mask * mask.new_tensor(code_weights) - loss_bbox = self.loss_bbox( - pred, target_box, bbox_weights, avg_factor=(num + 1e-4)) - loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap - loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox - return loss_dict - - def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False): - """Generate bboxes from bbox head predictions. - - Args: - preds_dicts (tuple[list[dict]]): Prediction results. - img_metas (list[dict]): Point cloud and image's meta info. - - Returns: - list[dict]: Decoded bbox, scores and labels after nms. - """ - rets = [] - for task_id, preds_dict in enumerate(preds_dicts): - num_class_with_bg = self.num_classes[task_id] - batch_size = preds_dict[0]['heatmap'].shape[0] - batch_heatmap = preds_dict[0]['heatmap'].sigmoid() - - batch_reg = preds_dict[0]['reg'] - batch_hei = preds_dict[0]['height'] - - if self.norm_bbox: - batch_dim = torch.exp(preds_dict[0]['dim']) - else: - batch_dim = preds_dict[0]['dim'] - - batch_rots = preds_dict[0]['rot'][:, 0].unsqueeze(1) - batch_rotc = preds_dict[0]['rot'][:, 1].unsqueeze(1) - - if 'vel' in preds_dict[0]: - batch_vel = preds_dict[0]['vel'] - else: - batch_vel = None - temp = self.bbox_coder.decode( - batch_heatmap, - batch_rots, - batch_rotc, - batch_hei, - batch_dim, - batch_vel, - reg=batch_reg, - task_id=task_id) - assert self.test_cfg['nms_type'] in ['circle', 'rotate'] - batch_reg_preds = [box['bboxes'] for box in temp] - batch_cls_preds = [box['scores'] for box in temp] - batch_cls_labels = [box['labels'] for box in temp] - if self.test_cfg['nms_type'] == 'circle': - ret_task = [] - for i in range(batch_size): - boxes3d = temp[i]['bboxes'] - scores = temp[i]['scores'] - labels = temp[i]['labels'] - centers = boxes3d[:, [0, 1]] - boxes = torch.cat([centers, scores.view(-1, 1)], dim=1) - keep = torch.tensor( - circle_nms( - boxes.detach().cpu().numpy(), - self.test_cfg['min_radius'][task_id], - post_max_size=self.test_cfg['post_max_size']), - dtype=torch.long, - device=boxes.device) - - boxes3d = boxes3d[keep] - scores = scores[keep] - labels = labels[keep] - ret = dict(bboxes=boxes3d, scores=scores, labels=labels) - ret_task.append(ret) - rets.append(ret_task) - else: - rets.append( - self.get_task_detections(num_class_with_bg, - batch_cls_preds, batch_reg_preds, - batch_cls_labels, img_metas)) - - # Merge branches results - 
num_samples = len(rets[0]) - - ret_list = [] - for i in range(num_samples): - for k in rets[0][i].keys(): - if k == 'bboxes': - bboxes = torch.cat([ret[i][k] for ret in rets]) - bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 - bboxes = img_metas[i]['box_type_3d']( - bboxes, self.bbox_coder.code_size) - elif k == 'scores': - scores = torch.cat([ret[i][k] for ret in rets]) - elif k == 'labels': - flag = 0 - for j, num_class in enumerate(self.num_classes): - rets[j][i][k] += flag - flag += num_class - labels = torch.cat([ret[i][k].int() for ret in rets]) - ret_list.append([bboxes, scores, labels]) - return ret_list - - def get_task_detections(self, num_class_with_bg, batch_cls_preds, - batch_reg_preds, batch_cls_labels, img_metas): - """Rotate nms for each task. - - Args: - num_class_with_bg (int): Number of classes for the current task. - batch_cls_preds (list[torch.Tensor]): Prediction score with the - shape of [N]. - batch_reg_preds (list[torch.Tensor]): Prediction bbox with the - shape of [N, 9]. - batch_cls_labels (list[torch.Tensor]): Prediction label with the - shape of [N]. - img_metas (list[dict]): Meta information of each sample. - - Returns: - list[dict[str: torch.Tensor]]: contains the following keys: - - -bboxes (torch.Tensor): Prediction bboxes after nms with the - shape of [N, 9]. - -scores (torch.Tensor): Prediction scores after nms with the - shape of [N]. - -labels (torch.Tensor): Prediction labels after nms with the - shape of [N]. - """ - predictions_dicts = [] - post_center_range = self.test_cfg['post_center_limit_range'] - if len(post_center_range) > 0: - post_center_range = torch.tensor( - post_center_range, - dtype=batch_reg_preds[0].dtype, - device=batch_reg_preds[0].device) - - for i, (box_preds, cls_preds, cls_labels) in enumerate( - zip(batch_reg_preds, batch_cls_preds, batch_cls_labels)): - - # Apply NMS in bird eye view - - # get the highest score per prediction, then apply nms - # to remove overlapped box. - if num_class_with_bg == 1: - top_scores = cls_preds.squeeze(-1) - top_labels = torch.zeros( - cls_preds.shape[0], - device=cls_preds.device, - dtype=torch.long) - - else: - top_labels = cls_labels.long() - top_scores = cls_preds.squeeze(-1) - - if self.test_cfg['score_threshold'] > 0.0: - thresh = torch.tensor( - [self.test_cfg['score_threshold']], - device=cls_preds.device).type_as(cls_preds) - top_scores_keep = top_scores >= thresh - top_scores = top_scores.masked_select(top_scores_keep) - - if top_scores.shape[0] != 0: - if self.test_cfg['score_threshold'] > 0.0: - box_preds = box_preds[top_scores_keep] - top_labels = top_labels[top_scores_keep] - - boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d']( - box_preds[:, :], self.bbox_coder.code_size).bev) - # the nms in 3d detection just remove overlap boxes. - - selected = nms_bev( - boxes_for_nms, - top_scores, - thresh=self.test_cfg['nms_thr'], - pre_max_size=self.test_cfg['pre_max_size'], - post_max_size=self.test_cfg['post_max_size']) - else: - selected = [] - - # if selected is not None: - selected_boxes = box_preds[selected] - selected_labels = top_labels[selected] - selected_scores = top_scores[selected] - - # finally generate predictions. 
- if selected_boxes.shape[0] != 0: - box_preds = selected_boxes - scores = selected_scores - label_preds = selected_labels - final_box_preds = box_preds - final_scores = scores - final_labels = label_preds - if post_center_range is not None: - mask = (final_box_preds[:, :3] >= - post_center_range[:3]).all(1) - mask &= (final_box_preds[:, :3] <= - post_center_range[3:]).all(1) - predictions_dict = dict( - bboxes=final_box_preds[mask], - scores=final_scores[mask], - labels=final_labels[mask]) - else: - predictions_dict = dict( - bboxes=final_box_preds, - scores=final_scores, - labels=final_labels) - else: - dtype = batch_reg_preds[0].dtype - device = batch_reg_preds[0].device - predictions_dict = dict( - bboxes=torch.zeros([0, self.bbox_coder.code_size], - dtype=dtype, - device=device), - scores=torch.zeros([0], dtype=dtype, device=device), - labels=torch.zeros([0], - dtype=top_labels.dtype, - device=device)) - - predictions_dicts.append(predictions_dict) - return predictions_dicts +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +from mmcv.cnn import ConvModule, build_conv_layer +from mmcv.runner import BaseModule, force_fp32 +from torch import nn + +from mmdet3d.core import (circle_nms, draw_heatmap_gaussian, gaussian_radius, + xywhr2xyxyr) +from mmdet3d.core.post_processing import nms_bev +from mmdet3d.models import builder +from mmdet3d.models.utils import clip_sigmoid +from mmdet.core import build_bbox_coder, multi_apply +from ..builder import HEADS, build_loss + + +@HEADS.register_module() +class SeparateHead(BaseModule): + """SeparateHead for CenterHead. + + Args: + in_channels (int): Input channels for conv_layer. + heads (dict): Conv information. + head_conv (int, optional): Output channels. + Default: 64. + final_kernel (int, optional): Kernel size for the last conv layer. + Default: 1. + init_bias (float, optional): Initial bias. Default: -2.19. + conv_cfg (dict, optional): Config of conv layer. + Default: dict(type='Conv2d') + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='BN2d'). + bias (str, optional): Type of bias. Default: 'auto'. + """ + + def __init__(self, + in_channels, + heads, + head_conv=64, + final_kernel=1, + init_bias=-2.19, + conv_cfg=dict(type='Conv2d'), + norm_cfg=dict(type='BN2d'), + bias='auto', + init_cfg=None, + **kwargs): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super(SeparateHead, self).__init__(init_cfg=init_cfg) + self.heads = heads + self.init_bias = init_bias + for head in self.heads: + classes, num_conv = self.heads[head] + + conv_layers = [] + c_in = in_channels + for i in range(num_conv - 1): + conv_layers.append( + ConvModule( + c_in, + head_conv, + kernel_size=final_kernel, + stride=1, + padding=final_kernel // 2, + bias=bias, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg)) + c_in = head_conv + + conv_layers.append( + build_conv_layer( + conv_cfg, + head_conv, + classes, + kernel_size=final_kernel, + stride=1, + padding=final_kernel // 2, + bias=True)) + conv_layers = nn.Sequential(*conv_layers) + + self.__setattr__(head, conv_layers) + + if init_cfg is None: + self.init_cfg = dict(type='Kaiming', layer='Conv2d') + + def init_weights(self): + """Initialize weights.""" + super().init_weights() + for head in self.heads: + if head == 'heatmap': + self.__getattr__(head)[-1].bias.data.fill_(self.init_bias) + + def forward(self, x): + """Forward function for SepHead. 
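A small worked example of the transpose-then-stack step documented in `CenterHead.get_targets` above: per-sample results are regrouped per task with `zip`, then stacked into one tensor per task (the shapes below are illustrative).

import torch

# two samples, each producing one heatmap per task (two tasks here)
per_sample = [
    [torch.zeros(1, 4, 4), torch.zeros(2, 4, 4)],  # sample a: task0, task1
    [torch.ones(1, 4, 4), torch.ones(2, 4, 4)],    # sample b: task0, task1
]
per_task = list(map(list, zip(*per_sample)))       # [[a0, b0], [a1, b1]]
heatmaps = [torch.stack(h) for h in per_task]
print([tuple(h.shape) for h in heatmaps])          # [(2, 1, 4, 4), (2, 2, 4, 4)]

The same regrouping is applied to `anno_boxes`, `inds` and `masks` before the per-task losses are computed.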
+ + Args: + x (torch.Tensor): Input feature map with the shape of + [B, 512, 128, 128]. + + Returns: + dict[str: torch.Tensor]: contains the following keys: + + -reg (torch.Tensor): 2D regression value with the + shape of [B, 2, H, W]. + -height (torch.Tensor): Height value with the + shape of [B, 1, H, W]. + -dim (torch.Tensor): Size value with the shape + of [B, 3, H, W]. + -rot (torch.Tensor): Rotation value with the + shape of [B, 2, H, W]. + -vel (torch.Tensor): Velocity value with the + shape of [B, 2, H, W]. + -heatmap (torch.Tensor): Heatmap with the shape of + [B, N, H, W]. + """ + ret_dict = dict() + for head in self.heads: + ret_dict[head] = self.__getattr__(head)(x) + + return ret_dict + + +@HEADS.register_module() +class DCNSeparateHead(BaseModule): + r"""DCNSeparateHead for CenterHead. + + .. code-block:: none + /-----> DCN for heatmap task -----> heatmap task. + feature + \-----> DCN for regression tasks -----> regression tasks + + Args: + in_channels (int): Input channels for conv_layer. + num_cls (int): Number of classes. + heads (dict): Conv information. + dcn_config (dict): Config of dcn layer. + head_conv (int, optional): Output channels. + Default: 64. + final_kernel (int, optional): Kernel size for the last conv + layer. Default: 1. + init_bias (float, optional): Initial bias. Default: -2.19. + conv_cfg (dict, optional): Config of conv layer. + Default: dict(type='Conv2d') + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='BN2d'). + bias (str, optional): Type of bias. Default: 'auto'. + """ # noqa: W605 + + def __init__(self, + in_channels, + num_cls, + heads, + dcn_config, + head_conv=64, + final_kernel=1, + init_bias=-2.19, + conv_cfg=dict(type='Conv2d'), + norm_cfg=dict(type='BN2d'), + bias='auto', + init_cfg=None, + **kwargs): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super(DCNSeparateHead, self).__init__(init_cfg=init_cfg) + if 'heatmap' in heads: + heads.pop('heatmap') + # feature adaptation with dcn + # use separate features for classification / regression + self.feature_adapt_cls = build_conv_layer(dcn_config) + + self.feature_adapt_reg = build_conv_layer(dcn_config) + + # heatmap prediction head + cls_head = [ + ConvModule( + in_channels, + head_conv, + kernel_size=3, + padding=1, + conv_cfg=conv_cfg, + bias=bias, + norm_cfg=norm_cfg), + build_conv_layer( + conv_cfg, + head_conv, + num_cls, + kernel_size=3, + stride=1, + padding=1, + bias=bias) + ] + self.cls_head = nn.Sequential(*cls_head) + self.init_bias = init_bias + # other regression target + self.task_head = SeparateHead( + in_channels, + heads, + head_conv=head_conv, + final_kernel=final_kernel, + bias=bias) + if init_cfg is None: + self.init_cfg = dict(type='Kaiming', layer='Conv2d') + + def init_weights(self): + """Initialize weights.""" + super().init_weights() + self.cls_head[-1].bias.data.fill_(self.init_bias) + + def forward(self, x): + """Forward function for DCNSepHead. + + Args: + x (torch.Tensor): Input feature map with the shape of + [B, 512, 128, 128]. + + Returns: + dict[str: torch.Tensor]: contains the following keys: + + -reg (torch.Tensor): 2D regression value with the + shape of [B, 2, H, W]. + -height (torch.Tensor): Height value with the + shape of [B, 1, H, W]. + -dim (torch.Tensor): Size value with the shape + of [B, 3, H, W]. + -rot (torch.Tensor): Rotation value with the + shape of [B, 2, H, W]. + -vel (torch.Tensor): Velocity value with the + shape of [B, 2, H, W]. 
+ -heatmap (torch.Tensor): Heatmap with the shape of + [B, N, H, W]. + """ + center_feat = self.feature_adapt_cls(x) + reg_feat = self.feature_adapt_reg(x) + + cls_score = self.cls_head(center_feat) + ret = self.task_head(reg_feat) + ret['heatmap'] = cls_score + + return ret + + +@HEADS.register_module() +class CenterHead(BaseModule): + """CenterHead for CenterPoint. + + Args: + in_channels (list[int] | int, optional): Channels of the input + feature map. Default: [128]. + tasks (list[dict], optional): Task information including class number + and class names. Default: None. + train_cfg (dict, optional): Train-time configs. Default: None. + test_cfg (dict, optional): Test-time configs. Default: None. + bbox_coder (dict, optional): Bbox coder configs. Default: None. + common_heads (dict, optional): Conv information for common heads. + Default: dict(). + loss_cls (dict, optional): Config of classification loss function. + Default: dict(type='GaussianFocalLoss', reduction='mean'). + loss_bbox (dict, optional): Config of regression loss function. + Default: dict(type='L1Loss', reduction='none'). + separate_head (dict, optional): Config of separate head. Default: dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3) + share_conv_channel (int, optional): Output channels for share_conv + layer. Default: 64. + num_heatmap_convs (int, optional): Number of conv layers for heatmap + conv layer. Default: 2. + conv_cfg (dict, optional): Config of conv layer. + Default: dict(type='Conv2d') + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='BN2d'). + bias (str, optional): Type of bias. Default: 'auto'. + """ + + def __init__(self, + in_channels=[128], + tasks=None, + train_cfg=None, + test_cfg=None, + bbox_coder=None, + common_heads=dict(), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict( + type='L1Loss', reduction='none', loss_weight=0.25), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + share_conv_channel=64, + num_heatmap_convs=2, + conv_cfg=dict(type='Conv2d'), + norm_cfg=dict(type='BN2d'), + bias='auto', + norm_bbox=True, + init_cfg=None): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super(CenterHead, self).__init__(init_cfg=init_cfg) + + num_classes = [len(t['class_names']) for t in tasks] + self.class_names = [t['class_names'] for t in tasks] + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.in_channels = in_channels + self.num_classes = num_classes + self.norm_bbox = norm_bbox + + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.bbox_coder = build_bbox_coder(bbox_coder) + self.num_anchor_per_locs = [n for n in num_classes] + self.fp16_enabled = False + + # a shared convolution + self.shared_conv = ConvModule( + in_channels, + share_conv_channel, + kernel_size=3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=bias) + + self.task_heads = nn.ModuleList() + + for num_cls in num_classes: + heads = copy.deepcopy(common_heads) + heads.update(dict(heatmap=(num_cls, num_heatmap_convs))) + separate_head.update( + in_channels=share_conv_channel, heads=heads, num_cls=num_cls) + self.task_heads.append(builder.build_head(separate_head)) + + def forward_single(self, x): + """Forward function for CenterPoint. + + Args: + x (torch.Tensor): Input feature map with the shape of + [B, 512, 128, 128]. + + Returns: + list[dict]: Output results for tasks. 
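+
+ Each entry of the returned list is produced by one ``SeparateHead``.
+ The sketch below is illustrative only: the head layout shown here is an
+ assumed example, while in practice it comes from ``common_heads`` plus
+ the per-task ``heatmap`` head::
+
+ >>> heads = dict(reg=(2, 2), height=(1, 2), dim=(3, 2),
+ ... rot=(2, 2), vel=(2, 2), heatmap=(1, 2))
+ >>> task_head = SeparateHead(64, heads, final_kernel=3)
+ >>> out = task_head(torch.rand(2, 64, 16, 16))
+ >>> sorted(out.keys())
+ ['dim', 'heatmap', 'height', 'reg', 'rot', 'vel']
+ >>> out['reg'].shape
+ torch.Size([2, 2, 16, 16])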
+ """ + ret_dicts = [] + + x = self.shared_conv(x) + + for task in self.task_heads: + ret_dicts.append(task(x)) + + return ret_dicts + + def forward(self, feats): + """Forward pass. + + Args: + feats (list[torch.Tensor]): Multi-level features, e.g., + features produced by FPN. + + Returns: + tuple(list[dict]): Output results for tasks. + """ + return multi_apply(self.forward_single, feats) + + def _gather_feat(self, feat, ind, mask=None): + """Gather feature map. + + Given feature map and index, return indexed feature map. + + Args: + feat (torch.tensor): Feature map with the shape of [B, H*W, 10]. + ind (torch.Tensor): Index of the ground truth boxes with the + shape of [B, max_obj]. + mask (torch.Tensor, optional): Mask of the feature map with the + shape of [B, max_obj]. Default: None. + + Returns: + torch.Tensor: Feature map after gathering with the shape + of [B, max_obj, 10]. + """ + dim = feat.size(2) + ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) + feat = feat.gather(1, ind) + if mask is not None: + mask = mask.unsqueeze(2).expand_as(feat) + feat = feat[mask] + feat = feat.view(-1, dim) + return feat + + def get_targets(self, gt_bboxes_3d, gt_labels_3d): + """Generate targets. + + How each output is transformed: + + Each nested list is transposed so that all same-index elements in + each sub-list (1, ..., N) become the new sub-lists. + [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ] + ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ] + + The new transposed nested list is converted into a list of N + tensors generated by concatenating tensors in the new sub-lists. + [ tensor0, tensor1, tensor2, ... ] + + Args: + gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground + truth gt boxes. + gt_labels_3d (list[torch.Tensor]): Labels of boxes. + + Returns: + Returns: + tuple[list[torch.Tensor]]: Tuple of target including + the following results in order. + + - list[torch.Tensor]: Heatmap scores. + - list[torch.Tensor]: Ground truth boxes. + - list[torch.Tensor]: Indexes indicating the + position of the valid boxes. + - list[torch.Tensor]: Masks indicating which + boxes are valid. + """ + heatmaps, anno_boxes, inds, masks = multi_apply( + self.get_targets_single, gt_bboxes_3d, gt_labels_3d) + # Transpose heatmaps + heatmaps = list(map(list, zip(*heatmaps))) + heatmaps = [torch.stack(hms_) for hms_ in heatmaps] + # Transpose anno_boxes + anno_boxes = list(map(list, zip(*anno_boxes))) + anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes] + # Transpose inds + inds = list(map(list, zip(*inds))) + inds = [torch.stack(inds_) for inds_ in inds] + # Transpose inds + masks = list(map(list, zip(*masks))) + masks = [torch.stack(masks_) for masks_ in masks] + return heatmaps, anno_boxes, inds, masks + + def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): + """Generate training targets for a single sample. + + Args: + gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. + gt_labels_3d (torch.Tensor): Labels of boxes. + + Returns: + tuple[list[torch.Tensor]]: Tuple of target including + the following results in order. + + - list[torch.Tensor]: Heatmap scores. + - list[torch.Tensor]: Ground truth boxes. + - list[torch.Tensor]: Indexes indicating the position + of the valid boxes. + - list[torch.Tensor]: Masks indicating which boxes + are valid. 
+ """ + device = gt_labels_3d.device + gt_bboxes_3d = torch.cat( + (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), + dim=1).to(device) + max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] + grid_size = torch.tensor(self.train_cfg['grid_size']) + pc_range = torch.tensor(self.train_cfg['point_cloud_range']) + voxel_size = torch.tensor(self.train_cfg['voxel_size']) + + feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] + + # reorganize the gt_dict by tasks + task_masks = [] + flag = 0 + for class_name in self.class_names: + task_masks.append([ + torch.where(gt_labels_3d == class_name.index(i) + flag) + for i in class_name + ]) + flag += len(class_name) + + task_boxes = [] + task_classes = [] + flag2 = 0 + for idx, mask in enumerate(task_masks): + task_box = [] + task_class = [] + for m in mask: + task_box.append(gt_bboxes_3d[m]) + # 0 is background for each task, so we need to add 1 here. + task_class.append(gt_labels_3d[m] + 1 - flag2) + task_boxes.append(torch.cat(task_box, axis=0).to(device)) + task_classes.append(torch.cat(task_class).long().to(device)) + flag2 += len(mask) + draw_gaussian = draw_heatmap_gaussian + heatmaps, anno_boxes, inds, masks = [], [], [], [] + + for idx, task_head in enumerate(self.task_heads): + heatmap = gt_bboxes_3d.new_zeros( + (len(self.class_names[idx]), feature_map_size[1], + feature_map_size[0])) + + anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), + dtype=torch.float32) + + ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64) + mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8) + + num_objs = min(task_boxes[idx].shape[0], max_objs) + + for k in range(num_objs): + cls_id = task_classes[idx][k] - 1 + + width = task_boxes[idx][k][3] + length = task_boxes[idx][k][4] + width = width / voxel_size[0] / self.train_cfg[ + 'out_size_factor'] + length = length / voxel_size[1] / self.train_cfg[ + 'out_size_factor'] + + if width > 0 and length > 0: + radius = gaussian_radius( + (length, width), + min_overlap=self.train_cfg['gaussian_overlap']) + radius = max(self.train_cfg['min_radius'], int(radius)) + + # be really careful for the coordinate system of + # your box annotation. 
+ x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ + 1], task_boxes[idx][k][2] + + coor_x = ( + x - pc_range[0] + ) / voxel_size[0] / self.train_cfg['out_size_factor'] + coor_y = ( + y - pc_range[1] + ) / voxel_size[1] / self.train_cfg['out_size_factor'] + + center = torch.tensor([coor_x, coor_y], + dtype=torch.float32, + device=device) + center_int = center.to(torch.int32) + + # throw out not in range objects to avoid out of array + # area when creating the heatmap + if not (0 <= center_int[0] < feature_map_size[0] + and 0 <= center_int[1] < feature_map_size[1]): + continue + + draw_gaussian(heatmap[cls_id], center_int, radius) + + new_idx = k + x, y = center_int[0], center_int[1] + + assert (y * feature_map_size[0] + x < + feature_map_size[0] * feature_map_size[1]) + + ind[new_idx] = y * feature_map_size[0] + x + mask[new_idx] = 1 + # TODO: support other outdoor dataset + vx, vy = task_boxes[idx][k][7:] + rot = task_boxes[idx][k][6] + box_dim = task_boxes[idx][k][3:6] + if self.norm_bbox: + box_dim = box_dim.log() + anno_box[new_idx] = torch.cat([ + center - torch.tensor([x, y], device=device), + z.unsqueeze(0), box_dim, + torch.sin(rot).unsqueeze(0), + torch.cos(rot).unsqueeze(0), + vx.unsqueeze(0), + vy.unsqueeze(0) + ]) + + heatmaps.append(heatmap) + anno_boxes.append(anno_box) + masks.append(mask) + inds.append(ind) + return heatmaps, anno_boxes, inds, masks + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs): + """Loss function for CenterHead. + + Args: + gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground + truth gt boxes. + gt_labels_3d (list[torch.Tensor]): Labels of boxes. + preds_dicts (dict): Output of forward function. + + Returns: + dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. + """ + heatmaps, anno_boxes, inds, masks = self.get_targets( + gt_bboxes_3d, gt_labels_3d) + loss_dict = dict() + for task_id, preds_dict in enumerate(preds_dicts): + # heatmap focal loss + preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) + num_pos = heatmaps[task_id].eq(1).float().sum().item() + loss_heatmap = self.loss_cls( + preds_dict[0]['heatmap'], + heatmaps[task_id], + avg_factor=max(num_pos, 1)) + target_box = anno_boxes[task_id] + # reconstruct the anno_box from multiple reg heads + preds_dict[0]['anno_box'] = torch.cat( + (preds_dict[0]['reg'], preds_dict[0]['height'], + preds_dict[0]['dim'], preds_dict[0]['rot'], + preds_dict[0]['vel']), + dim=1) + + # Regression loss for dimension, offset, height, rotation + ind = inds[task_id] + num = masks[task_id].float().sum() + pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() + pred = pred.view(pred.size(0), -1, pred.size(3)) + pred = self._gather_feat(pred, ind) + mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() + isnotnan = (~torch.isnan(target_box)).float() + mask *= isnotnan + + code_weights = self.train_cfg.get('code_weights', None) + bbox_weights = mask * mask.new_tensor(code_weights) + loss_bbox = self.loss_bbox( + pred, target_box, bbox_weights, avg_factor=(num + 1e-4)) + loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap + loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox + return loss_dict + + def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False): + """Generate bboxes from bbox head predictions. + + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + + Returns: + list[dict]: Decoded bbox, scores and labels after nms. 
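+
+ A purely illustrative ``test_cfg`` listing the keys this method and
+ ``get_task_detections`` read; the values below are placeholders, not
+ recommendations::
+
+ >>> test_cfg = dict(
+ ... nms_type='rotate', # or 'circle'
+ ... score_threshold=0.1,
+ ... pre_max_size=1000,
+ ... post_max_size=83,
+ ... nms_thr=0.2,
+ ... min_radius=[4, 12, 10, 1, 0.85, 0.175], # one entry per task
+ ... post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0])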
+ """ + rets = [] + for task_id, preds_dict in enumerate(preds_dicts): + num_class_with_bg = self.num_classes[task_id] + batch_size = preds_dict[0]['heatmap'].shape[0] + batch_heatmap = preds_dict[0]['heatmap'].sigmoid() + + batch_reg = preds_dict[0]['reg'] + batch_hei = preds_dict[0]['height'] + + if self.norm_bbox: + batch_dim = torch.exp(preds_dict[0]['dim']) + else: + batch_dim = preds_dict[0]['dim'] + + batch_rots = preds_dict[0]['rot'][:, 0].unsqueeze(1) + batch_rotc = preds_dict[0]['rot'][:, 1].unsqueeze(1) + + if 'vel' in preds_dict[0]: + batch_vel = preds_dict[0]['vel'] + else: + batch_vel = None + temp = self.bbox_coder.decode( + batch_heatmap, + batch_rots, + batch_rotc, + batch_hei, + batch_dim, + batch_vel, + reg=batch_reg, + task_id=task_id) + assert self.test_cfg['nms_type'] in ['circle', 'rotate'] + batch_reg_preds = [box['bboxes'] for box in temp] + batch_cls_preds = [box['scores'] for box in temp] + batch_cls_labels = [box['labels'] for box in temp] + if self.test_cfg['nms_type'] == 'circle': + ret_task = [] + for i in range(batch_size): + boxes3d = temp[i]['bboxes'] + scores = temp[i]['scores'] + labels = temp[i]['labels'] + centers = boxes3d[:, [0, 1]] + boxes = torch.cat([centers, scores.view(-1, 1)], dim=1) + keep = torch.tensor( + circle_nms( + boxes.detach().cpu().numpy(), + self.test_cfg['min_radius'][task_id], + post_max_size=self.test_cfg['post_max_size']), + dtype=torch.long, + device=boxes.device) + + boxes3d = boxes3d[keep] + scores = scores[keep] + labels = labels[keep] + ret = dict(bboxes=boxes3d, scores=scores, labels=labels) + ret_task.append(ret) + rets.append(ret_task) + else: + rets.append( + self.get_task_detections(num_class_with_bg, + batch_cls_preds, batch_reg_preds, + batch_cls_labels, img_metas)) + + # Merge branches results + num_samples = len(rets[0]) + + ret_list = [] + for i in range(num_samples): + for k in rets[0][i].keys(): + if k == 'bboxes': + bboxes = torch.cat([ret[i][k] for ret in rets]) + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + bboxes = img_metas[i]['box_type_3d']( + bboxes, self.bbox_coder.code_size) + elif k == 'scores': + scores = torch.cat([ret[i][k] for ret in rets]) + elif k == 'labels': + flag = 0 + for j, num_class in enumerate(self.num_classes): + rets[j][i][k] += flag + flag += num_class + labels = torch.cat([ret[i][k].int() for ret in rets]) + ret_list.append([bboxes, scores, labels]) + return ret_list + + def get_task_detections(self, num_class_with_bg, batch_cls_preds, + batch_reg_preds, batch_cls_labels, img_metas): + """Rotate nms for each task. + + Args: + num_class_with_bg (int): Number of classes for the current task. + batch_cls_preds (list[torch.Tensor]): Prediction score with the + shape of [N]. + batch_reg_preds (list[torch.Tensor]): Prediction bbox with the + shape of [N, 9]. + batch_cls_labels (list[torch.Tensor]): Prediction label with the + shape of [N]. + img_metas (list[dict]): Meta information of each sample. + + Returns: + list[dict[str: torch.Tensor]]: contains the following keys: + + -bboxes (torch.Tensor): Prediction bboxes after nms with the + shape of [N, 9]. + -scores (torch.Tensor): Prediction scores after nms with the + shape of [N]. + -labels (torch.Tensor): Prediction labels after nms with the + shape of [N]. 
+ """ + predictions_dicts = [] + post_center_range = self.test_cfg['post_center_limit_range'] + if len(post_center_range) > 0: + post_center_range = torch.tensor( + post_center_range, + dtype=batch_reg_preds[0].dtype, + device=batch_reg_preds[0].device) + + for i, (box_preds, cls_preds, cls_labels) in enumerate( + zip(batch_reg_preds, batch_cls_preds, batch_cls_labels)): + + # Apply NMS in bird eye view + + # get the highest score per prediction, then apply nms + # to remove overlapped box. + if num_class_with_bg == 1: + top_scores = cls_preds.squeeze(-1) + top_labels = torch.zeros( + cls_preds.shape[0], + device=cls_preds.device, + dtype=torch.long) + + else: + top_labels = cls_labels.long() + top_scores = cls_preds.squeeze(-1) + + if self.test_cfg['score_threshold'] > 0.0: + thresh = torch.tensor( + [self.test_cfg['score_threshold']], + device=cls_preds.device).type_as(cls_preds) + top_scores_keep = top_scores >= thresh + top_scores = top_scores.masked_select(top_scores_keep) + + if top_scores.shape[0] != 0: + if self.test_cfg['score_threshold'] > 0.0: + box_preds = box_preds[top_scores_keep] + top_labels = top_labels[top_scores_keep] + + boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d']( + box_preds[:, :], self.bbox_coder.code_size).bev) + # the nms in 3d detection just remove overlap boxes. + + selected = nms_bev( + boxes_for_nms, + top_scores, + thresh=self.test_cfg['nms_thr'], + pre_max_size=self.test_cfg['pre_max_size'], + post_max_size=self.test_cfg['post_max_size']) + else: + selected = [] + + # if selected is not None: + selected_boxes = box_preds[selected] + selected_labels = top_labels[selected] + selected_scores = top_scores[selected] + + # finally generate predictions. + if selected_boxes.shape[0] != 0: + box_preds = selected_boxes + scores = selected_scores + label_preds = selected_labels + final_box_preds = box_preds + final_scores = scores + final_labels = label_preds + if post_center_range is not None: + mask = (final_box_preds[:, :3] >= + post_center_range[:3]).all(1) + mask &= (final_box_preds[:, :3] <= + post_center_range[3:]).all(1) + predictions_dict = dict( + bboxes=final_box_preds[mask], + scores=final_scores[mask], + labels=final_labels[mask]) + else: + predictions_dict = dict( + bboxes=final_box_preds, + scores=final_scores, + labels=final_labels) + else: + dtype = batch_reg_preds[0].dtype + device = batch_reg_preds[0].device + predictions_dict = dict( + bboxes=torch.zeros([0, self.bbox_coder.code_size], + dtype=dtype, + device=device), + scores=torch.zeros([0], dtype=dtype, device=device), + labels=torch.zeros([0], + dtype=top_labels.dtype, + device=device)) + + predictions_dicts.append(predictions_dict) + return predictions_dicts diff --git a/mmdet3d/models/dense_heads/fcos_mono3d_head.py b/mmdet3d/models/dense_heads/fcos_mono3d_head.py index d0aa29f..e09a916 100644 --- a/mmdet3d/models/dense_heads/fcos_mono3d_head.py +++ b/mmdet3d/models/dense_heads/fcos_mono3d_head.py @@ -1,956 +1,956 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from logging import warning - -import numpy as np -import torch -from mmcv.cnn import Scale, normal_init -from mmcv.runner import force_fp32 -from torch import nn as nn - -from mmdet3d.core import (box3d_multiclass_nms, limit_period, points_img2cam, - xywhr2xyxyr) -from mmdet.core import multi_apply -from mmdet.core.bbox.builder import build_bbox_coder -from ..builder import HEADS, build_loss -from .anchor_free_mono3d_head import AnchorFreeMono3DHead - -INF = 1e8 - - -@HEADS.register_module() -class FCOSMono3DHead(AnchorFreeMono3DHead): - """Anchor-free head used in FCOS3D. - - Args: - num_classes (int): Number of categories excluding the background - category. - in_channels (int): Number of channels in the input feature map. - regress_ranges (tuple[tuple[int, int]], optional): Regress range of multiple - level points. - center_sampling (bool, optional): If true, use center sampling. Default: True. - center_sample_radius (float, optional): Radius of center sampling. Default: 1.5. - norm_on_bbox (bool, optional): If true, normalize the regression targets - with FPN strides. Default: True. - centerness_on_reg (bool, optional): If true, position centerness on the - regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. - Default: True. - centerness_alpha (int, optional): Parameter used to adjust the intensity - attenuation from the center to the periphery. Default: 2.5. - loss_cls (dict, optional): Config of classification loss. - loss_bbox (dict, optional): Config of localization loss. - loss_dir (dict, optional): Config of direction classification loss. - loss_attr (dict, optional): Config of attribute classification loss. - loss_centerness (dict, optional): Config of centerness loss. - norm_cfg (dict, optional): dictionary to construct and config norm layer. - Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). - centerness_branch (tuple[int], optional): Channels for centerness branch. - Default: (64, ). 
- """ # noqa: E501 - - def __init__(self, - regress_ranges=((-1, 48), (48, 96), (96, 192), (192, 384), - (384, INF)), - center_sampling=True, - center_sample_radius=1.5, - norm_on_bbox=True, - centerness_on_reg=True, - centerness_alpha=2.5, - loss_cls=dict( - type='FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_bbox=dict( - type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), - loss_dir=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=1.0), - loss_attr=dict( - type='CrossEntropyLoss', - use_sigmoid=False, - loss_weight=1.0), - loss_centerness=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - loss_weight=1.0), - bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9), - norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), - centerness_branch=(64, ), - init_cfg=None, - **kwargs): - self.regress_ranges = regress_ranges - self.center_sampling = center_sampling - self.center_sample_radius = center_sample_radius - self.norm_on_bbox = norm_on_bbox - self.centerness_on_reg = centerness_on_reg - self.centerness_alpha = centerness_alpha - self.centerness_branch = centerness_branch - super().__init__( - loss_cls=loss_cls, - loss_bbox=loss_bbox, - loss_dir=loss_dir, - loss_attr=loss_attr, - norm_cfg=norm_cfg, - init_cfg=init_cfg, - **kwargs) - self.loss_centerness = build_loss(loss_centerness) - bbox_coder['code_size'] = self.bbox_code_size - self.bbox_coder = build_bbox_coder(bbox_coder) - - def _init_layers(self): - """Initialize layers of the head.""" - super()._init_layers() - self.conv_centerness_prev = self._init_branch( - conv_channels=self.centerness_branch, - conv_strides=(1, ) * len(self.centerness_branch)) - self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1) - self.scale_dim = 3 # only for offset, depth and size regression - self.scales = nn.ModuleList([ - nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) - for _ in self.strides - ]) - - def init_weights(self): - """Initialize weights of the head. - - We currently still use the customized init_weights because the default - init of DCN triggered by the init_cfg will init conv_offset.weight, - which mistakenly affects the training stability. - """ - super().init_weights() - for m in self.conv_centerness_prev: - if isinstance(m.conv, nn.Conv2d): - normal_init(m.conv, std=0.01) - normal_init(self.conv_centerness, std=0.01) - - def forward(self, feats): - """Forward features from the upstream network. - - Args: - feats (tuple[Tensor]): Features from the upstream network, each is - a 4D-tensor. - - Returns: - tuple: - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_classes. - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2). - attr_preds (list[Tensor]): Attribute scores for each scale - level, each is a 4D-tensor, the channel number is - num_points * num_attrs. - centernesses (list[Tensor]): Centerness for each scale level, - each is a 4D-tensor, the channel number is num_points * 1. 
- """ - # Note: we use [:5] to filter feats and only return predictions - return multi_apply(self.forward_single, feats, self.scales, - self.strides)[:5] - - def forward_single(self, x, scale, stride): - """Forward features of a single scale level. - - Args: - x (Tensor): FPN feature maps of the specified stride. - scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize - the bbox prediction. - stride (int): The corresponding stride for feature maps, only - used to normalize the bbox prediction when self.norm_on_bbox - is True. - - Returns: - tuple: scores for each class, bbox and direction class - predictions, centerness predictions of input feature maps. - """ - cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ - super().forward_single(x) - - if self.centerness_on_reg: - clone_reg_feat = reg_feat.clone() - for conv_centerness_prev_layer in self.conv_centerness_prev: - clone_reg_feat = conv_centerness_prev_layer(clone_reg_feat) - centerness = self.conv_centerness(clone_reg_feat) - else: - clone_cls_feat = cls_feat.clone() - for conv_centerness_prev_layer in self.conv_centerness_prev: - clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat) - centerness = self.conv_centerness(clone_cls_feat) - - bbox_pred = self.bbox_coder.decode(bbox_pred, scale, stride, - self.training, cls_score) - - return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ - cls_feat, reg_feat - - @staticmethod - def add_sin_difference(boxes1, boxes2): - """Convert the rotation difference to difference in sine function. - - Args: - boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7 - and the 7th dimension is rotation dimension. - boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and - the 7th dimension is rotation dimension. - - Returns: - tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th - dimensions are changed. - """ - rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( - boxes2[..., 6:7]) - rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., - 6:7]) - boxes1 = torch.cat( - [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) - boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], - dim=-1) - return boxes1, boxes2 - - @staticmethod - def get_direction_target(reg_targets, - dir_offset=0, - dir_limit_offset=0.0, - num_bins=2, - one_hot=True): - """Encode direction to 0 ~ num_bins-1. - - Args: - reg_targets (torch.Tensor): Bbox regression targets. - dir_offset (int, optional): Direction offset. Default to 0. - dir_limit_offset (float, optional): Offset to set the direction - range. Default to 0.0. - num_bins (int, optional): Number of bins to divide 2*PI. - Default to 2. - one_hot (bool, optional): Whether to encode as one hot. - Default to True. - - Returns: - torch.Tensor: Encoded direction targets. 
- """ - rot_gt = reg_targets[..., 6] - offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, - 2 * np.pi) - dir_cls_targets = torch.floor(offset_rot / - (2 * np.pi / num_bins)).long() - dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) - if one_hot: - dir_targets = torch.zeros( - *list(dir_cls_targets.shape), - num_bins, - dtype=reg_targets.dtype, - device=dir_cls_targets.device) - dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) - dir_cls_targets = dir_targets - return dir_cls_targets - - @force_fp32( - apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds', - 'centernesses')) - def loss(self, - cls_scores, - bbox_preds, - dir_cls_preds, - attr_preds, - centernesses, - gt_bboxes, - gt_labels, - gt_bboxes_3d, - gt_labels_3d, - centers2d, - depths, - attr_labels, - img_metas, - gt_bboxes_ignore=None): - """Compute loss of the head. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_classes. - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2) - attr_preds (list[Tensor]): Attribute scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_attrs. - centernesses (list[Tensor]): Centerness for each scale level, each - is a 4D-tensor, the channel number is num_points * 1. - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): class indices corresponding to each box - gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of - (num_gts, code_size). - gt_labels_3d (list[Tensor]): same as gt_labels - centers2d (list[Tensor]): 2D centers on the image with shape of - (num_gts, 2). - depths (list[Tensor]): Depth ground truth with shape of - (num_gts, ). - attr_labels (list[Tensor]): Attributes indices of each box. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes_ignore (list[Tensor]): specify which bounding - boxes can be ignored when computing the loss. - - Returns: - dict[str, Tensor]: A dictionary of loss components. 
- """ - assert len(cls_scores) == len(bbox_preds) == len(centernesses) == len( - attr_preds) - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, - bbox_preds[0].device) - labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \ - self.get_targets( - all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, depths, attr_labels) - - num_imgs = cls_scores[0].size(0) - # flatten cls_scores, bbox_preds, dir_cls_preds and centerness - flatten_cls_scores = [ - cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) - for cls_score in cls_scores - ] - flatten_bbox_preds = [ - bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims)) - for bbox_pred in bbox_preds - ] - flatten_dir_cls_preds = [ - dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2) - for dir_cls_pred in dir_cls_preds - ] - flatten_centerness = [ - centerness.permute(0, 2, 3, 1).reshape(-1) - for centerness in centernesses - ] - flatten_cls_scores = torch.cat(flatten_cls_scores) - flatten_bbox_preds = torch.cat(flatten_bbox_preds) - flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds) - flatten_centerness = torch.cat(flatten_centerness) - flatten_labels_3d = torch.cat(labels_3d) - flatten_bbox_targets_3d = torch.cat(bbox_targets_3d) - flatten_centerness_targets = torch.cat(centerness_targets) - - # FG cat_id: [0, num_classes -1], BG cat_id: num_classes - bg_class_ind = self.num_classes - pos_inds = ((flatten_labels_3d >= 0) - & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1) - num_pos = len(pos_inds) - - loss_cls = self.loss_cls( - flatten_cls_scores, - flatten_labels_3d, - avg_factor=num_pos + num_imgs) # avoid num_pos is 0 - - pos_bbox_preds = flatten_bbox_preds[pos_inds] - pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds] - pos_centerness = flatten_centerness[pos_inds] - - if self.pred_attrs: - flatten_attr_preds = [ - attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs) - for attr_pred in attr_preds - ] - flatten_attr_preds = torch.cat(flatten_attr_preds) - flatten_attr_targets = torch.cat(attr_targets) - pos_attr_preds = flatten_attr_preds[pos_inds] - - if num_pos > 0: - pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] - pos_centerness_targets = flatten_centerness_targets[pos_inds] - if self.pred_attrs: - pos_attr_targets = flatten_attr_targets[pos_inds] - bbox_weights = pos_centerness_targets.new_ones( - len(pos_centerness_targets), sum(self.group_reg_dims)) - equal_weights = pos_centerness_targets.new_ones( - pos_centerness_targets.shape) - - code_weight = self.train_cfg.get('code_weight', None) - if code_weight: - assert len(code_weight) == sum(self.group_reg_dims) - bbox_weights = bbox_weights * bbox_weights.new_tensor( - code_weight) - - if self.use_direction_classifier: - pos_dir_cls_targets = self.get_direction_target( - pos_bbox_targets_3d, - self.dir_offset, - self.dir_limit_offset, - one_hot=False) - - if self.diff_rad_by_sin: - pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( - pos_bbox_preds, pos_bbox_targets_3d) - - loss_offset = self.loss_bbox( - pos_bbox_preds[:, :2], - pos_bbox_targets_3d[:, :2], - weight=bbox_weights[:, :2], - avg_factor=equal_weights.sum()) - loss_depth = self.loss_bbox( - pos_bbox_preds[:, 2], - pos_bbox_targets_3d[:, 2], - weight=bbox_weights[:, 2], - avg_factor=equal_weights.sum()) - loss_size = self.loss_bbox( - pos_bbox_preds[:, 3:6], - pos_bbox_targets_3d[:, 3:6], - weight=bbox_weights[:, 3:6], - 
avg_factor=equal_weights.sum()) - loss_rotsin = self.loss_bbox( - pos_bbox_preds[:, 6], - pos_bbox_targets_3d[:, 6], - weight=bbox_weights[:, 6], - avg_factor=equal_weights.sum()) - loss_velo = None - if self.pred_velo: - loss_velo = self.loss_bbox( - pos_bbox_preds[:, 7:9], - pos_bbox_targets_3d[:, 7:9], - weight=bbox_weights[:, 7:9], - avg_factor=equal_weights.sum()) - - loss_centerness = self.loss_centerness(pos_centerness, - pos_centerness_targets) - - # direction classification loss - loss_dir = None - # TODO: add more check for use_direction_classifier - if self.use_direction_classifier: - loss_dir = self.loss_dir( - pos_dir_cls_preds, - pos_dir_cls_targets, - equal_weights, - avg_factor=equal_weights.sum()) - - # attribute classification loss - loss_attr = None - if self.pred_attrs: - loss_attr = self.loss_attr( - pos_attr_preds, - pos_attr_targets, - pos_centerness_targets, - avg_factor=pos_centerness_targets.sum()) - - else: - # need absolute due to possible negative delta x/y - loss_offset = pos_bbox_preds[:, :2].sum() - loss_depth = pos_bbox_preds[:, 2].sum() - loss_size = pos_bbox_preds[:, 3:6].sum() - loss_rotsin = pos_bbox_preds[:, 6].sum() - loss_velo = None - if self.pred_velo: - loss_velo = pos_bbox_preds[:, 7:9].sum() - loss_centerness = pos_centerness.sum() - loss_dir = None - if self.use_direction_classifier: - loss_dir = pos_dir_cls_preds.sum() - loss_attr = None - if self.pred_attrs: - loss_attr = pos_attr_preds.sum() - - loss_dict = dict( - loss_cls=loss_cls, - loss_offset=loss_offset, - loss_depth=loss_depth, - loss_size=loss_size, - loss_rotsin=loss_rotsin, - loss_centerness=loss_centerness) - - if loss_velo is not None: - loss_dict['loss_velo'] = loss_velo - - if loss_dir is not None: - loss_dict['loss_dir'] = loss_dir - - if loss_attr is not None: - loss_dict['loss_attr'] = loss_attr - - return loss_dict - - @force_fp32( - apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds', - 'centernesses')) - def get_bboxes(self, - cls_scores, - bbox_preds, - dir_cls_preds, - attr_preds, - centernesses, - img_metas, - cfg=None, - rescale=None): - """Transform network output for a batch into bbox predictions. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level - Has shape (N, num_points * num_classes, H, W) - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level with shape (N, num_points * 4, H, W) - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2) - attr_preds (list[Tensor]): Attribute scores for each scale level - Has shape (N, num_points * num_attrs, H, W) - centernesses (list[Tensor]): Centerness for each scale level with - shape (N, num_points * 1, H, W) - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - cfg (mmcv.Config): Test / postprocessing configuration, - if None, test_cfg would be used - rescale (bool): If True, return boxes in original image space - - Returns: - list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. - The first item is an (n, 5) tensor, where the first 4 columns - are bounding box positions (tl_x, tl_y, br_x, br_y) and the - 5-th column is a score between 0 and 1. The second item is a - (n,) tensor where each item is the predicted class label of - the corresponding box. 
- """ - assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ - len(centernesses) == len(attr_preds) - num_levels = len(cls_scores) - - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, - bbox_preds[0].device) - result_list = [] - for img_id in range(len(img_metas)): - cls_score_list = [ - cls_scores[i][img_id].detach() for i in range(num_levels) - ] - bbox_pred_list = [ - bbox_preds[i][img_id].detach() for i in range(num_levels) - ] - if self.use_direction_classifier: - dir_cls_pred_list = [ - dir_cls_preds[i][img_id].detach() - for i in range(num_levels) - ] - else: - dir_cls_pred_list = [ - cls_scores[i][img_id].new_full( - [2, *cls_scores[i][img_id].shape[1:]], 0).detach() - for i in range(num_levels) - ] - if self.pred_attrs: - attr_pred_list = [ - attr_preds[i][img_id].detach() for i in range(num_levels) - ] - else: - attr_pred_list = [ - cls_scores[i][img_id].new_full( - [self.num_attrs, *cls_scores[i][img_id].shape[1:]], - self.attr_background_label).detach() - for i in range(num_levels) - ] - centerness_pred_list = [ - centernesses[i][img_id].detach() for i in range(num_levels) - ] - input_meta = img_metas[img_id] - det_bboxes = self._get_bboxes_single( - cls_score_list, bbox_pred_list, dir_cls_pred_list, - attr_pred_list, centerness_pred_list, mlvl_points, input_meta, - cfg, rescale) - result_list.append(det_bboxes) - return result_list - - def _get_bboxes_single(self, - cls_scores, - bbox_preds, - dir_cls_preds, - attr_preds, - centernesses, - mlvl_points, - input_meta, - cfg, - rescale=False): - """Transform outputs for a single batch item into bbox predictions. - - Args: - cls_scores (list[Tensor]): Box scores for a single scale level - Has shape (num_points * num_classes, H, W). - bbox_preds (list[Tensor]): Box energies / deltas for a single scale - level with shape (num_points * bbox_code_size, H, W). - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on a single scale level with shape - (num_points * 2, H, W) - attr_preds (list[Tensor]): Attribute scores for each scale level - Has shape (N, num_points * num_attrs, H, W) - centernesses (list[Tensor]): Centerness for a single scale level - with shape (num_points, H, W). - mlvl_points (list[Tensor]): Box reference for a single scale level - with shape (num_total_points, 2). - input_meta (dict): Metadata of input image. - cfg (mmcv.Config): Test / postprocessing configuration, - if None, test_cfg would be used. - rescale (bool): If True, return boxes in original image space. - - Returns: - tuples[Tensor]: Predicted 3D boxes, scores, labels and attributes. 
- """ - view = np.array(input_meta['cam2img']) - scale_factor = input_meta['scale_factor'] - cfg = self.test_cfg if cfg is None else cfg - assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) - mlvl_centers2d = [] - mlvl_bboxes = [] - mlvl_scores = [] - mlvl_dir_scores = [] - mlvl_attr_scores = [] - mlvl_centerness = [] - - for cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ - points in zip(cls_scores, bbox_preds, dir_cls_preds, - attr_preds, centernesses, mlvl_points): - assert cls_score.size()[-2:] == bbox_pred.size()[-2:] - scores = cls_score.permute(1, 2, 0).reshape( - -1, self.cls_out_channels).sigmoid() - dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) - dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] - attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs) - attr_score = torch.max(attr_pred, dim=-1)[1] - centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() - - bbox_pred = bbox_pred.permute(1, 2, - 0).reshape(-1, - sum(self.group_reg_dims)) - bbox_pred = bbox_pred[:, :self.bbox_code_size] - nms_pre = cfg.get('nms_pre', -1) - if nms_pre > 0 and scores.shape[0] > nms_pre: - max_scores, _ = (scores * centerness[:, None]).max(dim=1) - _, topk_inds = max_scores.topk(nms_pre) - points = points[topk_inds, :] - bbox_pred = bbox_pred[topk_inds, :] - scores = scores[topk_inds, :] - dir_cls_pred = dir_cls_pred[topk_inds, :] - centerness = centerness[topk_inds] - dir_cls_score = dir_cls_score[topk_inds] - attr_score = attr_score[topk_inds] - # change the offset to actual center predictions - bbox_pred[:, :2] = points - bbox_pred[:, :2] - if rescale: - bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor) - pred_center2d = bbox_pred[:, :3].clone() - bbox_pred[:, :3] = points_img2cam(bbox_pred[:, :3], view) - mlvl_centers2d.append(pred_center2d) - mlvl_bboxes.append(bbox_pred) - mlvl_scores.append(scores) - mlvl_dir_scores.append(dir_cls_score) - mlvl_attr_scores.append(attr_score) - mlvl_centerness.append(centerness) - - mlvl_centers2d = torch.cat(mlvl_centers2d) - mlvl_bboxes = torch.cat(mlvl_bboxes) - mlvl_dir_scores = torch.cat(mlvl_dir_scores) - - # change local yaw to global yaw for 3D nms - cam2img = mlvl_centers2d.new_zeros((4, 4)) - cam2img[:view.shape[0], :view.shape[1]] = \ - mlvl_centers2d.new_tensor(view) - mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, - mlvl_dir_scores, - self.dir_offset, cam2img) - - mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( - mlvl_bboxes, box_dim=self.bbox_code_size, - origin=(0.5, 0.5, 0.5)).bev) - - mlvl_scores = torch.cat(mlvl_scores) - padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) - # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 - # BG cat_id: num_class - mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) - mlvl_attr_scores = torch.cat(mlvl_attr_scores) - mlvl_centerness = torch.cat(mlvl_centerness) - # no scale_factors in box3d_multiclass_nms - # Then we multiply it from outside - mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None] - results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, - mlvl_nms_scores, cfg.score_thr, - cfg.max_per_img, cfg, mlvl_dir_scores, - mlvl_attr_scores) - bboxes, scores, labels, dir_scores, attrs = results - attrs = attrs.to(labels.dtype) # change data type to int - bboxes = input_meta['box_type_3d']( - bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) - # Note that the predictions use origin (0.5, 0.5, 0.5) - # Due to the ground truth centers2d are the gravity 
center of objects - # v0.10.0 fix inplace operation to the input tensor of cam_box3d - # So here we also need to add origin=(0.5, 0.5, 0.5) - if not self.pred_attrs: - attrs = None - - return bboxes, scores, labels, attrs - - @staticmethod - def pts2Dto3D(points, view): - """ - Args: - points (torch.Tensor): points in 2D images, [N, 3], - 3 corresponds with x, y in the image and depth. - view (np.ndarray): camera intrinsic, [3, 3] - - Returns: - torch.Tensor: points in 3D space. [N, 3], - 3 corresponds with x, y, z in 3D space. - """ - warning.warn('DeprecationWarning: This static method has been moved ' - 'out of this class to mmdet3d/core. The function ' - 'pts2Dto3D will be deprecated.') - - assert view.shape[0] <= 4 - assert view.shape[1] <= 4 - assert points.shape[1] == 3 - - points2D = points[:, :2] - depths = points[:, 2].view(-1, 1) - unnorm_points2D = torch.cat([points2D * depths, depths], dim=1) - - viewpad = torch.eye(4, dtype=points2D.dtype, device=points2D.device) - viewpad[:view.shape[0], :view.shape[1]] = points2D.new_tensor(view) - inv_viewpad = torch.inverse(viewpad).transpose(0, 1) - - # Do operation in homogeneous coordinates. - nbr_points = unnorm_points2D.shape[0] - homo_points2D = torch.cat( - [unnorm_points2D, - points2D.new_ones((nbr_points, 1))], dim=1) - points3D = torch.mm(homo_points2D, inv_viewpad)[:, :3] - - return points3D - - def _get_points_single(self, - featmap_size, - stride, - dtype, - device, - flatten=False): - """Get points according to feature map sizes.""" - y, x = super()._get_points_single(featmap_size, stride, dtype, device) - points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride), - dim=-1) + stride // 2 - return points - - def get_targets(self, points, gt_bboxes_list, gt_labels_list, - gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, - depths_list, attr_labels_list): - """Compute regression, classification and centerss targets for points - in multiple images. - - Args: - points (list[Tensor]): Points of each fpn level, each has shape - (num_points, 2). - gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, - each has shape (num_gt, 4). - gt_labels_list (list[Tensor]): Ground truth labels of each box, - each has shape (num_gt,). - gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each - image, each has shape (num_gt, bbox_code_size). - gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each - box, each has shape (num_gt,). - centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, - each has shape (num_gt, 2). - depths_list (list[Tensor]): Depth of projected 3D centers onto 2D - image, each has shape (num_gt, 1). - attr_labels_list (list[Tensor]): Attribute labels of each box, - each has shape (num_gt,). - - Returns: - tuple: - concat_lvl_labels (list[Tensor]): Labels of each level. - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each - level. 
- """ - assert len(points) == len(self.regress_ranges) - num_levels = len(points) - # expand regress ranges to align with points - expanded_regress_ranges = [ - points[i].new_tensor(self.regress_ranges[i])[None].expand_as( - points[i]) for i in range(num_levels) - ] - # concat all levels points and regress ranges - concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) - concat_points = torch.cat(points, dim=0) - - # the number of points per img, per lvl - num_points = [center.size(0) for center in points] - - if attr_labels_list is None: - attr_labels_list = [ - gt_labels.new_full(gt_labels.shape, self.attr_background_label) - for gt_labels in gt_labels_list - ] - - # get labels and bbox_targets of each image - _, _, labels_3d_list, bbox_targets_3d_list, centerness_targets_list, \ - attr_targets_list = multi_apply( - self._get_target_single, - gt_bboxes_list, - gt_labels_list, - gt_bboxes_3d_list, - gt_labels_3d_list, - centers2d_list, - depths_list, - attr_labels_list, - points=concat_points, - regress_ranges=concat_regress_ranges, - num_points_per_lvl=num_points) - - # split to per img, per level - labels_3d_list = [ - labels_3d.split(num_points, 0) for labels_3d in labels_3d_list - ] - bbox_targets_3d_list = [ - bbox_targets_3d.split(num_points, 0) - for bbox_targets_3d in bbox_targets_3d_list - ] - centerness_targets_list = [ - centerness_targets.split(num_points, 0) - for centerness_targets in centerness_targets_list - ] - attr_targets_list = [ - attr_targets.split(num_points, 0) - for attr_targets in attr_targets_list - ] - - # concat per level image - concat_lvl_labels_3d = [] - concat_lvl_bbox_targets_3d = [] - concat_lvl_centerness_targets = [] - concat_lvl_attr_targets = [] - for i in range(num_levels): - concat_lvl_labels_3d.append( - torch.cat([labels[i] for labels in labels_3d_list])) - concat_lvl_centerness_targets.append( - torch.cat([ - centerness_targets[i] - for centerness_targets in centerness_targets_list - ])) - bbox_targets_3d = torch.cat([ - bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list - ]) - concat_lvl_attr_targets.append( - torch.cat( - [attr_targets[i] for attr_targets in attr_targets_list])) - if self.norm_on_bbox: - bbox_targets_3d[:, : - 2] = bbox_targets_3d[:, :2] / self.strides[i] - concat_lvl_bbox_targets_3d.append(bbox_targets_3d) - return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \ - concat_lvl_centerness_targets, concat_lvl_attr_targets - - def _get_target_single(self, gt_bboxes, gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, depths, attr_labels, - points, regress_ranges, num_points_per_lvl): - """Compute regression and classification targets for a single image.""" - num_points = points.size(0) - num_gts = gt_labels.size(0) - if not isinstance(gt_bboxes_3d, torch.Tensor): - gt_bboxes_3d = gt_bboxes_3d.tensor.to(gt_bboxes.device) - if num_gts == 0: - return gt_labels.new_full((num_points,), self.background_label), \ - gt_bboxes.new_zeros((num_points, 4)), \ - gt_labels_3d.new_full( - (num_points,), self.background_label), \ - gt_bboxes_3d.new_zeros((num_points, self.bbox_code_size)), \ - gt_bboxes_3d.new_zeros((num_points,)), \ - attr_labels.new_full( - (num_points,), self.attr_background_label) - - # change orientation to local yaw - gt_bboxes_3d[..., 6] = -torch.atan2( - gt_bboxes_3d[..., 0], gt_bboxes_3d[..., 2]) + gt_bboxes_3d[..., 6] - - areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( - gt_bboxes[:, 3] - gt_bboxes[:, 1]) - areas = areas[None].repeat(num_points, 1) - regress_ranges = regress_ranges[:, None, 
:].expand( - num_points, num_gts, 2) - gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) - centers2d = centers2d[None].expand(num_points, num_gts, 2) - gt_bboxes_3d = gt_bboxes_3d[None].expand(num_points, num_gts, - self.bbox_code_size) - depths = depths[None, :, None].expand(num_points, num_gts, 1) - xs, ys = points[:, 0], points[:, 1] - xs = xs[:, None].expand(num_points, num_gts) - ys = ys[:, None].expand(num_points, num_gts) - - delta_xs = (xs - centers2d[..., 0])[..., None] - delta_ys = (ys - centers2d[..., 1])[..., None] - bbox_targets_3d = torch.cat( - (delta_xs, delta_ys, depths, gt_bboxes_3d[..., 3:]), dim=-1) - - left = xs - gt_bboxes[..., 0] - right = gt_bboxes[..., 2] - xs - top = ys - gt_bboxes[..., 1] - bottom = gt_bboxes[..., 3] - ys - bbox_targets = torch.stack((left, top, right, bottom), -1) - - assert self.center_sampling is True, 'Setting center_sampling to '\ - 'False has not been implemented for FCOS3D.' - # condition1: inside a `center bbox` - radius = self.center_sample_radius - center_xs = centers2d[..., 0] - center_ys = centers2d[..., 1] - center_gts = torch.zeros_like(gt_bboxes) - stride = center_xs.new_zeros(center_xs.shape) - - # project the points on current lvl back to the `original` sizes - lvl_begin = 0 - for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): - lvl_end = lvl_begin + num_points_lvl - stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius - lvl_begin = lvl_end - - center_gts[..., 0] = center_xs - stride - center_gts[..., 1] = center_ys - stride - center_gts[..., 2] = center_xs + stride - center_gts[..., 3] = center_ys + stride - - cb_dist_left = xs - center_gts[..., 0] - cb_dist_right = center_gts[..., 2] - xs - cb_dist_top = ys - center_gts[..., 1] - cb_dist_bottom = center_gts[..., 3] - ys - center_bbox = torch.stack( - (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) - inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 - - # condition2: limit the regression range for each location - max_regress_distance = bbox_targets.max(-1)[0] - inside_regress_range = ( - (max_regress_distance >= regress_ranges[..., 0]) - & (max_regress_distance <= regress_ranges[..., 1])) - - # center-based criterion to deal with ambiguity - dists = torch.sqrt(torch.sum(bbox_targets_3d[..., :2]**2, dim=-1)) - dists[inside_gt_bbox_mask == 0] = INF - dists[inside_regress_range == 0] = INF - min_dist, min_dist_inds = dists.min(dim=1) - - labels = gt_labels[min_dist_inds] - labels_3d = gt_labels_3d[min_dist_inds] - attr_labels = attr_labels[min_dist_inds] - labels[min_dist == INF] = self.background_label # set as BG - labels_3d[min_dist == INF] = self.background_label # set as BG - attr_labels[min_dist == INF] = self.attr_background_label - - bbox_targets = bbox_targets[range(num_points), min_dist_inds] - bbox_targets_3d = bbox_targets_3d[range(num_points), min_dist_inds] - relative_dists = torch.sqrt( - torch.sum(bbox_targets_3d[..., :2]**2, - dim=-1)) / (1.414 * stride[:, 0]) - # [N, 1] / [N, 1] - centerness_targets = torch.exp(-self.centerness_alpha * relative_dists) - - return labels, bbox_targets, labels_3d, bbox_targets_3d, \ - centerness_targets, attr_labels +# Copyright (c) OpenMMLab. All rights reserved. 
+from logging import warning + +import numpy as np +import torch +from mmcv.cnn import Scale, normal_init +from mmcv.runner import force_fp32 +from torch import nn as nn + +from mmdet3d.core import (box3d_multiclass_nms, limit_period, points_img2cam, + xywhr2xyxyr) +from mmdet.core import multi_apply +from mmdet.core.bbox.builder import build_bbox_coder +from ..builder import HEADS, build_loss +from .anchor_free_mono3d_head import AnchorFreeMono3DHead + +INF = 1e8 + + +@HEADS.register_module() +class FCOSMono3DHead(AnchorFreeMono3DHead): + """Anchor-free head used in FCOS3D. + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + regress_ranges (tuple[tuple[int, int]], optional): Regress range of multiple + level points. + center_sampling (bool, optional): If true, use center sampling. Default: True. + center_sample_radius (float, optional): Radius of center sampling. Default: 1.5. + norm_on_bbox (bool, optional): If true, normalize the regression targets + with FPN strides. Default: True. + centerness_on_reg (bool, optional): If true, position centerness on the + regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. + Default: True. + centerness_alpha (int, optional): Parameter used to adjust the intensity + attenuation from the center to the periphery. Default: 2.5. + loss_cls (dict, optional): Config of classification loss. + loss_bbox (dict, optional): Config of localization loss. + loss_dir (dict, optional): Config of direction classification loss. + loss_attr (dict, optional): Config of attribute classification loss. + loss_centerness (dict, optional): Config of centerness loss. + norm_cfg (dict, optional): dictionary to construct and config norm layer. + Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). + centerness_branch (tuple[int], optional): Channels for centerness branch. + Default: (64, ). 
+ """ # noqa: E501 + + def __init__(self, + regress_ranges=((-1, 48), (48, 96), (96, 192), (192, 384), + (384, INF)), + center_sampling=True, + center_sample_radius=1.5, + norm_on_bbox=True, + centerness_on_reg=True, + centerness_alpha=2.5, + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9), + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + centerness_branch=(64, ), + init_cfg=None, + **kwargs): + self.regress_ranges = regress_ranges + self.center_sampling = center_sampling + self.center_sample_radius = center_sample_radius + self.norm_on_bbox = norm_on_bbox + self.centerness_on_reg = centerness_on_reg + self.centerness_alpha = centerness_alpha + self.centerness_branch = centerness_branch + super().__init__( + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_dir=loss_dir, + loss_attr=loss_attr, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.loss_centerness = build_loss(loss_centerness) + bbox_coder['code_size'] = self.bbox_code_size + self.bbox_coder = build_bbox_coder(bbox_coder) + + def _init_layers(self): + """Initialize layers of the head.""" + super()._init_layers() + self.conv_centerness_prev = self._init_branch( + conv_channels=self.centerness_branch, + conv_strides=(1, ) * len(self.centerness_branch)) + self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1) + self.scale_dim = 3 # only for offset, depth and size regression + self.scales = nn.ModuleList([ + nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) + for _ in self.strides + ]) + + def init_weights(self): + """Initialize weights of the head. + + We currently still use the customized init_weights because the default + init of DCN triggered by the init_cfg will init conv_offset.weight, + which mistakenly affects the training stability. + """ + super().init_weights() + for m in self.conv_centerness_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + normal_init(self.conv_centerness, std=0.01) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2). + attr_preds (list[Tensor]): Attribute scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, + each is a 4D-tensor, the channel number is num_points * 1. 
+ """ + # Note: we use [:5] to filter feats and only return predictions + return multi_apply(self.forward_single, feats, self.scales, + self.strides)[:5] + + def forward_single(self, x, scale, stride): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox and direction class + predictions, centerness predictions of input feature maps. + """ + cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ + super().forward_single(x) + + if self.centerness_on_reg: + clone_reg_feat = reg_feat.clone() + for conv_centerness_prev_layer in self.conv_centerness_prev: + clone_reg_feat = conv_centerness_prev_layer(clone_reg_feat) + centerness = self.conv_centerness(clone_reg_feat) + else: + clone_cls_feat = cls_feat.clone() + for conv_centerness_prev_layer in self.conv_centerness_prev: + clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat) + centerness = self.conv_centerness(clone_cls_feat) + + bbox_pred = self.bbox_coder.decode(bbox_pred, scale, stride, + self.training, cls_score) + + return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ + cls_feat, reg_feat + + @staticmethod + def add_sin_difference(boxes1, boxes2): + """Convert the rotation difference to difference in sine function. + + Args: + boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7 + and the 7th dimension is rotation dimension. + boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and + the 7th dimension is rotation dimension. + + Returns: + tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th + dimensions are changed. + """ + rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( + boxes2[..., 6:7]) + rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., + 6:7]) + boxes1 = torch.cat( + [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) + boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], + dim=-1) + return boxes1, boxes2 + + @staticmethod + def get_direction_target(reg_targets, + dir_offset=0, + dir_limit_offset=0.0, + num_bins=2, + one_hot=True): + """Encode direction to 0 ~ num_bins-1. + + Args: + reg_targets (torch.Tensor): Bbox regression targets. + dir_offset (int, optional): Direction offset. Default to 0. + dir_limit_offset (float, optional): Offset to set the direction + range. Default to 0.0. + num_bins (int, optional): Number of bins to divide 2*PI. + Default to 2. + one_hot (bool, optional): Whether to encode as one hot. + Default to True. + + Returns: + torch.Tensor: Encoded direction targets. 
+ """ + rot_gt = reg_targets[..., 6] + offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, + 2 * np.pi) + dir_cls_targets = torch.floor(offset_rot / + (2 * np.pi / num_bins)).long() + dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) + if one_hot: + dir_targets = torch.zeros( + *list(dir_cls_targets.shape), + num_bins, + dtype=reg_targets.dtype, + device=dir_cls_targets.device) + dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) + dir_cls_targets = dir_targets + return dir_cls_targets + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds', + 'centernesses')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + centernesses, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + attr_preds (list[Tensor]): Attribute scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of + (num_gts, code_size). + gt_labels_3d (list[Tensor]): same as gt_labels + centers2d (list[Tensor]): 2D centers on the image with shape of + (num_gts, 2). + depths (list[Tensor]): Depth ground truth with shape of + (num_gts, ). + attr_labels (list[Tensor]): Attributes indices of each box. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
+ """ + assert len(cls_scores) == len(bbox_preds) == len(centernesses) == len( + attr_preds) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \ + self.get_targets( + all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores, bbox_preds, dir_cls_preds and centerness + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims)) + for bbox_pred in bbox_preds + ] + flatten_dir_cls_preds = [ + dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2) + for dir_cls_pred in dir_cls_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds) + flatten_centerness = torch.cat(flatten_centerness) + flatten_labels_3d = torch.cat(labels_3d) + flatten_bbox_targets_3d = torch.cat(bbox_targets_3d) + flatten_centerness_targets = torch.cat(centerness_targets) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels_3d >= 0) + & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1) + num_pos = len(pos_inds) + + loss_cls = self.loss_cls( + flatten_cls_scores, + flatten_labels_3d, + avg_factor=num_pos + num_imgs) # avoid num_pos is 0 + + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + + if self.pred_attrs: + flatten_attr_preds = [ + attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs) + for attr_pred in attr_preds + ] + flatten_attr_preds = torch.cat(flatten_attr_preds) + flatten_attr_targets = torch.cat(attr_targets) + pos_attr_preds = flatten_attr_preds[pos_inds] + + if num_pos > 0: + pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] + pos_centerness_targets = flatten_centerness_targets[pos_inds] + if self.pred_attrs: + pos_attr_targets = flatten_attr_targets[pos_inds] + bbox_weights = pos_centerness_targets.new_ones( + len(pos_centerness_targets), sum(self.group_reg_dims)) + equal_weights = pos_centerness_targets.new_ones( + pos_centerness_targets.shape) + + code_weight = self.train_cfg.get('code_weight', None) + if code_weight: + assert len(code_weight) == sum(self.group_reg_dims) + bbox_weights = bbox_weights * bbox_weights.new_tensor( + code_weight) + + if self.use_direction_classifier: + pos_dir_cls_targets = self.get_direction_target( + pos_bbox_targets_3d, + self.dir_offset, + self.dir_limit_offset, + one_hot=False) + + if self.diff_rad_by_sin: + pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( + pos_bbox_preds, pos_bbox_targets_3d) + + loss_offset = self.loss_bbox( + pos_bbox_preds[:, :2], + pos_bbox_targets_3d[:, :2], + weight=bbox_weights[:, :2], + avg_factor=equal_weights.sum()) + loss_depth = self.loss_bbox( + pos_bbox_preds[:, 2], + pos_bbox_targets_3d[:, 2], + weight=bbox_weights[:, 2], + avg_factor=equal_weights.sum()) + loss_size = self.loss_bbox( + pos_bbox_preds[:, 3:6], + pos_bbox_targets_3d[:, 3:6], + weight=bbox_weights[:, 3:6], + 
avg_factor=equal_weights.sum()) + loss_rotsin = self.loss_bbox( + pos_bbox_preds[:, 6], + pos_bbox_targets_3d[:, 6], + weight=bbox_weights[:, 6], + avg_factor=equal_weights.sum()) + loss_velo = None + if self.pred_velo: + loss_velo = self.loss_bbox( + pos_bbox_preds[:, 7:9], + pos_bbox_targets_3d[:, 7:9], + weight=bbox_weights[:, 7:9], + avg_factor=equal_weights.sum()) + + loss_centerness = self.loss_centerness(pos_centerness, + pos_centerness_targets) + + # direction classification loss + loss_dir = None + # TODO: add more check for use_direction_classifier + if self.use_direction_classifier: + loss_dir = self.loss_dir( + pos_dir_cls_preds, + pos_dir_cls_targets, + equal_weights, + avg_factor=equal_weights.sum()) + + # attribute classification loss + loss_attr = None + if self.pred_attrs: + loss_attr = self.loss_attr( + pos_attr_preds, + pos_attr_targets, + pos_centerness_targets, + avg_factor=pos_centerness_targets.sum()) + + else: + # need absolute due to possible negative delta x/y + loss_offset = pos_bbox_preds[:, :2].sum() + loss_depth = pos_bbox_preds[:, 2].sum() + loss_size = pos_bbox_preds[:, 3:6].sum() + loss_rotsin = pos_bbox_preds[:, 6].sum() + loss_velo = None + if self.pred_velo: + loss_velo = pos_bbox_preds[:, 7:9].sum() + loss_centerness = pos_centerness.sum() + loss_dir = None + if self.use_direction_classifier: + loss_dir = pos_dir_cls_preds.sum() + loss_attr = None + if self.pred_attrs: + loss_attr = pos_attr_preds.sum() + + loss_dict = dict( + loss_cls=loss_cls, + loss_offset=loss_offset, + loss_depth=loss_depth, + loss_size=loss_size, + loss_rotsin=loss_rotsin, + loss_centerness=loss_centerness) + + if loss_velo is not None: + loss_dict['loss_velo'] = loss_velo + + if loss_dir is not None: + loss_dict['loss_dir'] = loss_dir + + if loss_attr is not None: + loss_dict['loss_attr'] = loss_attr + + return loss_dict + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds', + 'centernesses')) + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + centernesses, + img_metas, + cfg=None, + rescale=None): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W) + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + attr_preds (list[Tensor]): Attribute scores for each scale level + Has shape (N, num_points * num_attrs, H, W) + centernesses (list[Tensor]): Centerness for each scale level with + shape (N, num_points * 1, H, W) + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of + the corresponding box. 
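
The per-component regression losses above carve a flattened prediction of width `sum(self.group_reg_dims)` into offset, depth, size, rotation and velocity groups. A small sketch of that bookkeeping; the `(2, 1, 3, 1, 2)` grouping and the `code_weight` values are illustrative assumptions chosen to be consistent with the hard-coded slices `[:, :2]`, `[:, 2]`, `[:, 3:6]`, `[:, 6]` and `[:, 7:9]` above:

```python
import torch

group_reg_dims = (2, 1, 3, 1, 2)   # assumed: offset, depth, size, rot(sin), velocity
num_pos = 4
bbox_preds = torch.randn(num_pos, sum(group_reg_dims))

offset, depth, size, rotsin, velo = torch.split(bbox_preds, group_reg_dims, dim=1)

# These match the hard-coded slices used in the loss above.
assert torch.equal(offset, bbox_preds[:, :2])
assert torch.equal(depth, bbox_preds[:, 2:3])
assert torch.equal(size, bbox_preds[:, 3:6])
assert torch.equal(rotsin, bbox_preds[:, 6:7])
assert torch.equal(velo, bbox_preds[:, 7:9])

# Optional per-dimension re-weighting, mirroring train_cfg['code_weight']
# (the concrete numbers here are illustrative).
code_weight = torch.tensor([1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05])
bbox_weights = torch.ones_like(bbox_preds) * code_weight
```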
+ """ + assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ + len(centernesses) == len(attr_preds) + num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + if self.use_direction_classifier: + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() + for i in range(num_levels) + ] + else: + dir_cls_pred_list = [ + cls_scores[i][img_id].new_full( + [2, *cls_scores[i][img_id].shape[1:]], 0).detach() + for i in range(num_levels) + ] + if self.pred_attrs: + attr_pred_list = [ + attr_preds[i][img_id].detach() for i in range(num_levels) + ] + else: + attr_pred_list = [ + cls_scores[i][img_id].new_full( + [self.num_attrs, *cls_scores[i][img_id].shape[1:]], + self.attr_background_label).detach() + for i in range(num_levels) + ] + centerness_pred_list = [ + centernesses[i][img_id].detach() for i in range(num_levels) + ] + input_meta = img_metas[img_id] + det_bboxes = self._get_bboxes_single( + cls_score_list, bbox_pred_list, dir_cls_pred_list, + attr_pred_list, centerness_pred_list, mlvl_points, input_meta, + cfg, rescale) + result_list.append(det_bboxes) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + attr_preds, + centernesses, + mlvl_points, + input_meta, + cfg, + rescale=False): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for a single scale level + Has shape (num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for a single scale + level with shape (num_points * bbox_code_size, H, W). + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on a single scale level with shape + (num_points * 2, H, W) + attr_preds (list[Tensor]): Attribute scores for each scale level + Has shape (N, num_points * num_attrs, H, W) + centernesses (list[Tensor]): Centerness for a single scale level + with shape (num_points, H, W). + mlvl_points (list[Tensor]): Box reference for a single scale level + with shape (num_total_points, 2). + input_meta (dict): Metadata of input image. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + + Returns: + tuples[Tensor]: Predicted 3D boxes, scores, labels and attributes. 
+ """ + view = np.array(input_meta['cam2img']) + scale_factor = input_meta['scale_factor'] + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_centers2d = [] + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + mlvl_attr_scores = [] + mlvl_centerness = [] + + for cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ + points in zip(cls_scores, bbox_preds, dir_cls_preds, + attr_preds, centernesses, mlvl_points): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs) + attr_score = torch.max(attr_pred, dim=-1)[1] + centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() + + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, + sum(self.group_reg_dims)) + bbox_pred = bbox_pred[:, :self.bbox_code_size] + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + max_scores, _ = (scores * centerness[:, None]).max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_pred = dir_cls_pred[topk_inds, :] + centerness = centerness[topk_inds] + dir_cls_score = dir_cls_score[topk_inds] + attr_score = attr_score[topk_inds] + # change the offset to actual center predictions + bbox_pred[:, :2] = points - bbox_pred[:, :2] + if rescale: + bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor) + pred_center2d = bbox_pred[:, :3].clone() + bbox_pred[:, :3] = points_img2cam(bbox_pred[:, :3], view) + mlvl_centers2d.append(pred_center2d) + mlvl_bboxes.append(bbox_pred) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + mlvl_attr_scores.append(attr_score) + mlvl_centerness.append(centerness) + + mlvl_centers2d = torch.cat(mlvl_centers2d) + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + # change local yaw to global yaw for 3D nms + cam2img = mlvl_centers2d.new_zeros((4, 4)) + cam2img[:view.shape[0], :view.shape[1]] = \ + mlvl_centers2d.new_tensor(view) + mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, + mlvl_dir_scores, + self.dir_offset, cam2img) + + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + mlvl_bboxes, box_dim=self.bbox_code_size, + origin=(0.5, 0.5, 0.5)).bev) + + mlvl_scores = torch.cat(mlvl_scores) + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + mlvl_attr_scores = torch.cat(mlvl_attr_scores) + mlvl_centerness = torch.cat(mlvl_centerness) + # no scale_factors in box3d_multiclass_nms + # Then we multiply it from outside + mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None] + results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_nms_scores, cfg.score_thr, + cfg.max_per_img, cfg, mlvl_dir_scores, + mlvl_attr_scores) + bboxes, scores, labels, dir_scores, attrs = results + attrs = attrs.to(labels.dtype) # change data type to int + bboxes = input_meta['box_type_3d']( + bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) + # Note that the predictions use origin (0.5, 0.5, 0.5) + # Due to the ground truth centers2d are the gravity 
center of objects + # v0.10.0 fix inplace operation to the input tensor of cam_box3d + # So here we also need to add origin=(0.5, 0.5, 0.5) + if not self.pred_attrs: + attrs = None + + return bboxes, scores, labels, attrs + + @staticmethod + def pts2Dto3D(points, view): + """ + Args: + points (torch.Tensor): points in 2D images, [N, 3], + 3 corresponds with x, y in the image and depth. + view (np.ndarray): camera intrinsic, [3, 3] + + Returns: + torch.Tensor: points in 3D space. [N, 3], + 3 corresponds with x, y, z in 3D space. + """ + warning.warn('DeprecationWarning: This static method has been moved ' + 'out of this class to mmdet3d/core. The function ' + 'pts2Dto3D will be deprecated.') + + assert view.shape[0] <= 4 + assert view.shape[1] <= 4 + assert points.shape[1] == 3 + + points2D = points[:, :2] + depths = points[:, 2].view(-1, 1) + unnorm_points2D = torch.cat([points2D * depths, depths], dim=1) + + viewpad = torch.eye(4, dtype=points2D.dtype, device=points2D.device) + viewpad[:view.shape[0], :view.shape[1]] = points2D.new_tensor(view) + inv_viewpad = torch.inverse(viewpad).transpose(0, 1) + + # Do operation in homogeneous coordinates. + nbr_points = unnorm_points2D.shape[0] + homo_points2D = torch.cat( + [unnorm_points2D, + points2D.new_ones((nbr_points, 1))], dim=1) + points3D = torch.mm(homo_points2D, inv_viewpad)[:, :3] + + return points3D + + def _get_points_single(self, + featmap_size, + stride, + dtype, + device, + flatten=False): + """Get points according to feature map sizes.""" + y, x = super()._get_points_single(featmap_size, stride, dtype, device) + points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride), + dim=-1) + stride // 2 + return points + + def get_targets(self, points, gt_bboxes_list, gt_labels_list, + gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, + depths_list, attr_labels_list): + """Compute regression, classification and centerss targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each + image, each has shape (num_gt, bbox_code_size). + gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each + box, each has shape (num_gt,). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + each has shape (num_gt, 2). + depths_list (list[Tensor]): Depth of projected 3D centers onto 2D + image, each has shape (num_gt, 1). + attr_labels_list (list[Tensor]): Attribute labels of each box, + each has shape (num_gt,). + + Returns: + tuple: + concat_lvl_labels (list[Tensor]): Labels of each level. + concat_lvl_bbox_targets (list[Tensor]): BBox targets of each + level. 
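
`pts2Dto3D` above (deprecated in favour of `points_img2cam` from `mmdet3d.core`) unprojects image points `(u, v, depth)` into the camera frame through the padded inverse intrinsic. For a plain pinhole intrinsic the same result follows from the closed-form formula; the sketch below checks the two routes against each other with made-up numbers:

```python
import torch

# Hypothetical 3x3 pinhole intrinsic; fx, fy, cx, cy are made-up numbers.
K = torch.tensor([[1260.0, 0.0, 800.0],
                  [0.0, 1260.0, 450.0],
                  [0.0, 0.0, 1.0]])

points = torch.tensor([[820.0, 470.0, 25.0],   # (u, v, depth)
                       [400.0, 440.0, 10.0]])

# Route 1: homogeneous form, as in pts2Dto3D / points_img2cam.
uv, depth = points[:, :2], points[:, 2:3]
homo = torch.cat([uv * depth, depth, torch.ones_like(depth)], dim=1)  # (N, 4)
viewpad = torch.eye(4)
viewpad[:3, :3] = K
xyz_homo = (homo @ torch.inverse(viewpad).T)[:, :3]

# Route 2: closed-form pinhole unprojection.
fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
x = (points[:, 0] - cx) * points[:, 2] / fx
y = (points[:, 1] - cy) * points[:, 2] / fy
xyz_direct = torch.stack([x, y, points[:, 2]], dim=1)

assert torch.allclose(xyz_homo, xyz_direct, atol=1e-4)
```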
+ """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + if attr_labels_list is None: + attr_labels_list = [ + gt_labels.new_full(gt_labels.shape, self.attr_background_label) + for gt_labels in gt_labels_list + ] + + # get labels and bbox_targets of each image + _, _, labels_3d_list, bbox_targets_3d_list, centerness_targets_list, \ + attr_targets_list = multi_apply( + self._get_target_single, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_3d_list, + gt_labels_3d_list, + centers2d_list, + depths_list, + attr_labels_list, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + labels_3d_list = [ + labels_3d.split(num_points, 0) for labels_3d in labels_3d_list + ] + bbox_targets_3d_list = [ + bbox_targets_3d.split(num_points, 0) + for bbox_targets_3d in bbox_targets_3d_list + ] + centerness_targets_list = [ + centerness_targets.split(num_points, 0) + for centerness_targets in centerness_targets_list + ] + attr_targets_list = [ + attr_targets.split(num_points, 0) + for attr_targets in attr_targets_list + ] + + # concat per level image + concat_lvl_labels_3d = [] + concat_lvl_bbox_targets_3d = [] + concat_lvl_centerness_targets = [] + concat_lvl_attr_targets = [] + for i in range(num_levels): + concat_lvl_labels_3d.append( + torch.cat([labels[i] for labels in labels_3d_list])) + concat_lvl_centerness_targets.append( + torch.cat([ + centerness_targets[i] + for centerness_targets in centerness_targets_list + ])) + bbox_targets_3d = torch.cat([ + bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list + ]) + concat_lvl_attr_targets.append( + torch.cat( + [attr_targets[i] for attr_targets in attr_targets_list])) + if self.norm_on_bbox: + bbox_targets_3d[:, : + 2] = bbox_targets_3d[:, :2] / self.strides[i] + concat_lvl_bbox_targets_3d.append(bbox_targets_3d) + return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \ + concat_lvl_centerness_targets, concat_lvl_attr_targets + + def _get_target_single(self, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, + points, regress_ranges, num_points_per_lvl): + """Compute regression and classification targets for a single image.""" + num_points = points.size(0) + num_gts = gt_labels.size(0) + if not isinstance(gt_bboxes_3d, torch.Tensor): + gt_bboxes_3d = gt_bboxes_3d.tensor.to(gt_bboxes.device) + if num_gts == 0: + return gt_labels.new_full((num_points,), self.background_label), \ + gt_bboxes.new_zeros((num_points, 4)), \ + gt_labels_3d.new_full( + (num_points,), self.background_label), \ + gt_bboxes_3d.new_zeros((num_points, self.bbox_code_size)), \ + gt_bboxes_3d.new_zeros((num_points,)), \ + attr_labels.new_full( + (num_points,), self.attr_background_label) + + # change orientation to local yaw + gt_bboxes_3d[..., 6] = -torch.atan2( + gt_bboxes_3d[..., 0], gt_bboxes_3d[..., 2]) + gt_bboxes_3d[..., 6] + + areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + areas = areas[None].repeat(num_points, 1) + regress_ranges = regress_ranges[:, None, 
:].expand( + num_points, num_gts, 2) + gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) + centers2d = centers2d[None].expand(num_points, num_gts, 2) + gt_bboxes_3d = gt_bboxes_3d[None].expand(num_points, num_gts, + self.bbox_code_size) + depths = depths[None, :, None].expand(num_points, num_gts, 1) + xs, ys = points[:, 0], points[:, 1] + xs = xs[:, None].expand(num_points, num_gts) + ys = ys[:, None].expand(num_points, num_gts) + + delta_xs = (xs - centers2d[..., 0])[..., None] + delta_ys = (ys - centers2d[..., 1])[..., None] + bbox_targets_3d = torch.cat( + (delta_xs, delta_ys, depths, gt_bboxes_3d[..., 3:]), dim=-1) + + left = xs - gt_bboxes[..., 0] + right = gt_bboxes[..., 2] - xs + top = ys - gt_bboxes[..., 1] + bottom = gt_bboxes[..., 3] - ys + bbox_targets = torch.stack((left, top, right, bottom), -1) + + assert self.center_sampling is True, 'Setting center_sampling to '\ + 'False has not been implemented for FCOS3D.' + # condition1: inside a `center bbox` + radius = self.center_sample_radius + center_xs = centers2d[..., 0] + center_ys = centers2d[..., 1] + center_gts = torch.zeros_like(gt_bboxes) + stride = center_xs.new_zeros(center_xs.shape) + + # project the points on current lvl back to the `original` sizes + lvl_begin = 0 + for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): + lvl_end = lvl_begin + num_points_lvl + stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius + lvl_begin = lvl_end + + center_gts[..., 0] = center_xs - stride + center_gts[..., 1] = center_ys - stride + center_gts[..., 2] = center_xs + stride + center_gts[..., 3] = center_ys + stride + + cb_dist_left = xs - center_gts[..., 0] + cb_dist_right = center_gts[..., 2] - xs + cb_dist_top = ys - center_gts[..., 1] + cb_dist_bottom = center_gts[..., 3] - ys + center_bbox = torch.stack( + (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + + # condition2: limit the regression range for each location + max_regress_distance = bbox_targets.max(-1)[0] + inside_regress_range = ( + (max_regress_distance >= regress_ranges[..., 0]) + & (max_regress_distance <= regress_ranges[..., 1])) + + # center-based criterion to deal with ambiguity + dists = torch.sqrt(torch.sum(bbox_targets_3d[..., :2]**2, dim=-1)) + dists[inside_gt_bbox_mask == 0] = INF + dists[inside_regress_range == 0] = INF + min_dist, min_dist_inds = dists.min(dim=1) + + labels = gt_labels[min_dist_inds] + labels_3d = gt_labels_3d[min_dist_inds] + attr_labels = attr_labels[min_dist_inds] + labels[min_dist == INF] = self.background_label # set as BG + labels_3d[min_dist == INF] = self.background_label # set as BG + attr_labels[min_dist == INF] = self.attr_background_label + + bbox_targets = bbox_targets[range(num_points), min_dist_inds] + bbox_targets_3d = bbox_targets_3d[range(num_points), min_dist_inds] + relative_dists = torch.sqrt( + torch.sum(bbox_targets_3d[..., :2]**2, + dim=-1)) / (1.414 * stride[:, 0]) + # [N, 1] / [N, 1] + centerness_targets = torch.exp(-self.centerness_alpha * relative_dists) + + return labels, bbox_targets, labels_3d, bbox_targets_3d, \ + centerness_targets, attr_labels diff --git a/mmdet3d/models/dense_heads/free_anchor3d_head.py b/mmdet3d/models/dense_heads/free_anchor3d_head.py index a56f2c7..b0d9502 100644 --- a/mmdet3d/models/dense_heads/free_anchor3d_head.py +++ b/mmdet3d/models/dense_heads/free_anchor3d_head.py @@ -1,285 +1,285 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
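
The soft centerness target computed at the end of `_get_target_single` above decays exponentially with the 2D distance between a location and its assigned projected 3D centre, normalised by the radius-scaled stride. A quick look at that curve (the stride of 16 is an illustrative value; `centerness_alpha` and `center_sample_radius` follow the defaults documented earlier):

```python
import torch

centerness_alpha = 2.5
center_sample_radius = 1.5
stride = 16.0                          # illustrative FPN stride
scaled_stride = stride * center_sample_radius

# Pixel offsets from the projected 3D centre to a few locations.
delta = torch.tensor([[0.0, 0.0], [8.0, 0.0], [16.0, 16.0], [48.0, 0.0]])
dists = torch.sqrt((delta ** 2).sum(dim=-1))

relative_dists = dists / (1.414 * scaled_stride)
centerness_targets = torch.exp(-centerness_alpha * relative_dists)
print(centerness_targets)
# Roughly 1.00 at the centre, ~0.55 at 8 px, ~0.19 at (16, 16), ~0.03 at 48 px.
```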
-import torch -from mmcv.runner import force_fp32 -from torch.nn import functional as F - -from mmdet3d.core.bbox import bbox_overlaps_nearest_3d -from ..builder import HEADS -from .anchor3d_head import Anchor3DHead -from .train_mixins import get_direction_target - - -@HEADS.register_module() -class FreeAnchor3DHead(Anchor3DHead): - r"""`FreeAnchor `_ head for 3D detection. - - Note: - This implementation is directly modified from the `mmdet implementation - `_. - We find it also works on 3D detection with minor modification, i.e., - different hyper-parameters and a additional direction classifier. - - Args: - pre_anchor_topk (int): Number of boxes that be token in each bag. - bbox_thr (float): The threshold of the saturated linear function. It is - usually the same with the IoU threshold used in NMS. - gamma (float): Gamma parameter in focal loss. - alpha (float): Alpha parameter in focal loss. - kwargs (dict): Other arguments are the same as those in :class:`Anchor3DHead`. - """ # noqa: E501 - - def __init__(self, - pre_anchor_topk=50, - bbox_thr=0.6, - gamma=2.0, - alpha=0.5, - init_cfg=None, - **kwargs): - super().__init__(init_cfg=init_cfg, **kwargs) - self.pre_anchor_topk = pre_anchor_topk - self.bbox_thr = bbox_thr - self.gamma = gamma - self.alpha = alpha - - @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) - def loss(self, - cls_scores, - bbox_preds, - dir_cls_preds, - gt_bboxes, - gt_labels, - input_metas, - gt_bboxes_ignore=None): - """Calculate loss of FreeAnchor head. - - Args: - cls_scores (list[torch.Tensor]): Classification scores of - different samples. - bbox_preds (list[torch.Tensor]): Box predictions of - different samples - dir_cls_preds (list[torch.Tensor]): Direction predictions of - different samples - gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes. - gt_labels (list[torch.Tensor]): Ground truth labels. - input_metas (list[dict]): List of input meta information. - gt_bboxes_ignore (list[:obj:`BaseInstance3DBoxes`], optional): - Ground truth boxes that should be ignored. Defaults to None. - - Returns: - dict[str, torch.Tensor]: Loss items. - - - positive_bag_loss (torch.Tensor): Loss of positive samples. - - negative_bag_loss (torch.Tensor): Loss of negative samples. 
- """ - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - assert len(featmap_sizes) == self.anchor_generator.num_levels - - anchor_list = self.get_anchors(featmap_sizes, input_metas) - anchors = [torch.cat(anchor) for anchor in anchor_list] - - # concatenate each level - cls_scores = [ - cls_score.permute(0, 2, 3, 1).reshape( - cls_score.size(0), -1, self.num_classes) - for cls_score in cls_scores - ] - bbox_preds = [ - bbox_pred.permute(0, 2, 3, 1).reshape( - bbox_pred.size(0), -1, self.box_code_size) - for bbox_pred in bbox_preds - ] - dir_cls_preds = [ - dir_cls_pred.permute(0, 2, 3, - 1).reshape(dir_cls_pred.size(0), -1, 2) - for dir_cls_pred in dir_cls_preds - ] - - cls_scores = torch.cat(cls_scores, dim=1) - bbox_preds = torch.cat(bbox_preds, dim=1) - dir_cls_preds = torch.cat(dir_cls_preds, dim=1) - - cls_prob = torch.sigmoid(cls_scores) - box_prob = [] - num_pos = 0 - positive_losses = [] - for _, (anchors_, gt_labels_, gt_bboxes_, cls_prob_, bbox_preds_, - dir_cls_preds_) in enumerate( - zip(anchors, gt_labels, gt_bboxes, cls_prob, bbox_preds, - dir_cls_preds)): - - gt_bboxes_ = gt_bboxes_.tensor.to(anchors_.device) - - with torch.no_grad(): - # box_localization: a_{j}^{loc}, shape: [j, 4] - pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_) - - # object_box_iou: IoU_{ij}^{loc}, shape: [i, j] - object_box_iou = bbox_overlaps_nearest_3d( - gt_bboxes_, pred_boxes) - - # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j] - t1 = self.bbox_thr - t2 = object_box_iou.max( - dim=1, keepdim=True).values.clamp(min=t1 + 1e-6) - object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp( - min=0, max=1) - - # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j] - num_obj = gt_labels_.size(0) - indices = torch.stack( - [torch.arange(num_obj).type_as(gt_labels_), gt_labels_], - dim=0) - - object_cls_box_prob = torch.sparse_coo_tensor( - indices, object_box_prob) - - # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j] - """ - from "start" to "end" implement: - image_box_iou = torch.sparse.max(object_cls_box_prob, - dim=0).t() - - """ - # start - box_cls_prob = torch.sparse.sum( - object_cls_box_prob, dim=0).to_dense() - - indices = torch.nonzero(box_cls_prob, as_tuple=False).t_() - if indices.numel() == 0: - image_box_prob = torch.zeros( - anchors_.size(0), - self.num_classes).type_as(object_box_prob) - else: - nonzero_box_prob = torch.where( - (gt_labels_.unsqueeze(dim=-1) == indices[0]), - object_box_prob[:, indices[1]], - torch.tensor( - [0]).type_as(object_box_prob)).max(dim=0).values - - # upmap to shape [j, c] - image_box_prob = torch.sparse_coo_tensor( - indices.flip([0]), - nonzero_box_prob, - size=(anchors_.size(0), self.num_classes)).to_dense() - # end - - box_prob.append(image_box_prob) - - # construct bags for objects - match_quality_matrix = bbox_overlaps_nearest_3d( - gt_bboxes_, anchors_) - _, matched = torch.topk( - match_quality_matrix, - self.pre_anchor_topk, - dim=1, - sorted=False) - del match_quality_matrix - - # matched_cls_prob: P_{ij}^{cls} - matched_cls_prob = torch.gather( - cls_prob_[matched], 2, - gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk, - 1)).squeeze(2) - - # matched_box_prob: P_{ij}^{loc} - matched_anchors = anchors_[matched] - matched_object_targets = self.bbox_coder.encode( - matched_anchors, - gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors)) - - # direction classification loss - loss_dir = None - if self.use_direction_classifier: - # also calculate direction prob: P_{ij}^{dir} - matched_dir_targets = 
get_direction_target( - matched_anchors, - matched_object_targets, - self.dir_offset, - self.dir_limit_offset, - one_hot=False) - loss_dir = self.loss_dir( - dir_cls_preds_[matched].transpose(-2, -1), - matched_dir_targets, - reduction_override='none') - - # generate bbox weights - if self.diff_rad_by_sin: - bbox_preds_[matched], matched_object_targets = \ - self.add_sin_difference( - bbox_preds_[matched], matched_object_targets) - bbox_weights = matched_anchors.new_ones(matched_anchors.size()) - # Use pop is not right, check performance - code_weight = self.train_cfg.get('code_weight', None) - if code_weight: - bbox_weights = bbox_weights * bbox_weights.new_tensor( - code_weight) - loss_bbox = self.loss_bbox( - bbox_preds_[matched], - matched_object_targets, - bbox_weights, - reduction_override='none').sum(-1) - - if loss_dir is not None: - loss_bbox += loss_dir - matched_box_prob = torch.exp(-loss_bbox) - - # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )} - num_pos += len(gt_bboxes_) - positive_losses.append( - self.positive_bag_loss(matched_cls_prob, matched_box_prob)) - - positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos) - - # box_prob: P{a_{j} \in A_{+}} - box_prob = torch.stack(box_prob, dim=0) - - # negative_loss: - # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B|| - negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max( - 1, num_pos * self.pre_anchor_topk) - - losses = { - 'positive_bag_loss': positive_loss, - 'negative_bag_loss': negative_loss - } - return losses - - def positive_bag_loss(self, matched_cls_prob, matched_box_prob): - """Generate positive bag loss. - - Args: - matched_cls_prob (torch.Tensor): Classification probability - of matched positive samples. - matched_box_prob (torch.Tensor): Bounding box probability - of matched positive samples. - - Returns: - torch.Tensor: Loss of positive samples. - """ - # bag_prob = Mean-max(matched_prob) - matched_prob = matched_cls_prob * matched_box_prob - weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None) - weight /= weight.sum(dim=1).unsqueeze(dim=-1) - bag_prob = (weight * matched_prob).sum(dim=1) - # positive_bag_loss = -self.alpha * log(bag_prob) - bag_prob = bag_prob.clamp(0, 1) # to avoid bug of BCE, check - return self.alpha * F.binary_cross_entropy( - bag_prob, torch.ones_like(bag_prob), reduction='none') - - def negative_bag_loss(self, cls_prob, box_prob): - """Generate negative bag loss. - - Args: - cls_prob (torch.Tensor): Classification probability - of negative samples. - box_prob (torch.Tensor): Bounding box probability - of negative samples. - - Returns: - torch.Tensor: Loss of negative samples. - """ - prob = cls_prob * (1 - box_prob) - prob = prob.clamp(0, 1) # to avoid bug of BCE, check - negative_bag_loss = prob**self.gamma * F.binary_cross_entropy( - prob, torch.zeros_like(prob), reduction='none') - return (1 - self.alpha) * negative_bag_loss +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.runner import force_fp32 +from torch.nn import functional as F + +from mmdet3d.core.bbox import bbox_overlaps_nearest_3d +from ..builder import HEADS +from .anchor3d_head import Anchor3DHead +from .train_mixins import get_direction_target + + +@HEADS.register_module() +class FreeAnchor3DHead(Anchor3DHead): + r"""`FreeAnchor `_ head for 3D detection. + + Note: + This implementation is directly modified from the `mmdet implementation + `_. 
+ We find it also works on 3D detection with minor modification, i.e., + different hyper-parameters and a additional direction classifier. + + Args: + pre_anchor_topk (int): Number of boxes that be token in each bag. + bbox_thr (float): The threshold of the saturated linear function. It is + usually the same with the IoU threshold used in NMS. + gamma (float): Gamma parameter in focal loss. + alpha (float): Alpha parameter in focal loss. + kwargs (dict): Other arguments are the same as those in :class:`Anchor3DHead`. + """ # noqa: E501 + + def __init__(self, + pre_anchor_topk=50, + bbox_thr=0.6, + gamma=2.0, + alpha=0.5, + init_cfg=None, + **kwargs): + super().__init__(init_cfg=init_cfg, **kwargs) + self.pre_anchor_topk = pre_anchor_topk + self.bbox_thr = bbox_thr + self.gamma = gamma + self.alpha = alpha + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + gt_bboxes, + gt_labels, + input_metas, + gt_bboxes_ignore=None): + """Calculate loss of FreeAnchor head. + + Args: + cls_scores (list[torch.Tensor]): Classification scores of + different samples. + bbox_preds (list[torch.Tensor]): Box predictions of + different samples + dir_cls_preds (list[torch.Tensor]): Direction predictions of + different samples + gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes. + gt_labels (list[torch.Tensor]): Ground truth labels. + input_metas (list[dict]): List of input meta information. + gt_bboxes_ignore (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth boxes that should be ignored. Defaults to None. + + Returns: + dict[str, torch.Tensor]: Loss items. + + - positive_bag_loss (torch.Tensor): Loss of positive samples. + - negative_bag_loss (torch.Tensor): Loss of negative samples. 
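
Inside the loss body that follows, each ground-truth box turns its row of IoUs against the decoded predictions into a matching probability through a saturated linear function: IoUs at or below `bbox_thr` map to 0 and the best IoU for that object maps to 1. A standalone sketch with made-up IoUs:

```python
import torch

bbox_thr = 0.6
# IoU between 2 ground-truth boxes (rows) and 5 decoded anchors (columns).
object_box_iou = torch.tensor([[0.20, 0.55, 0.62, 0.71, 0.90],
                               [0.10, 0.30, 0.58, 0.61, 0.65]])

t1 = bbox_thr
t2 = object_box_iou.max(dim=1, keepdim=True).values.clamp(min=t1 + 1e-6)
object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp(min=0, max=1)

print(object_box_prob)
# Row 0: [0.000, 0.000, 0.067, 0.367, 1.000]  (best IoU saturates at 1)
# Row 1: [0.000, 0.000, 0.000, 0.200, 1.000]
```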
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + anchor_list = self.get_anchors(featmap_sizes, input_metas) + anchors = [torch.cat(anchor) for anchor in anchor_list] + + # concatenate each level + cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape( + cls_score.size(0), -1, self.num_classes) + for cls_score in cls_scores + ] + bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape( + bbox_pred.size(0), -1, self.box_code_size) + for bbox_pred in bbox_preds + ] + dir_cls_preds = [ + dir_cls_pred.permute(0, 2, 3, + 1).reshape(dir_cls_pred.size(0), -1, 2) + for dir_cls_pred in dir_cls_preds + ] + + cls_scores = torch.cat(cls_scores, dim=1) + bbox_preds = torch.cat(bbox_preds, dim=1) + dir_cls_preds = torch.cat(dir_cls_preds, dim=1) + + cls_prob = torch.sigmoid(cls_scores) + box_prob = [] + num_pos = 0 + positive_losses = [] + for _, (anchors_, gt_labels_, gt_bboxes_, cls_prob_, bbox_preds_, + dir_cls_preds_) in enumerate( + zip(anchors, gt_labels, gt_bboxes, cls_prob, bbox_preds, + dir_cls_preds)): + + gt_bboxes_ = gt_bboxes_.tensor.to(anchors_.device) + + with torch.no_grad(): + # box_localization: a_{j}^{loc}, shape: [j, 4] + pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_) + + # object_box_iou: IoU_{ij}^{loc}, shape: [i, j] + object_box_iou = bbox_overlaps_nearest_3d( + gt_bboxes_, pred_boxes) + + # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j] + t1 = self.bbox_thr + t2 = object_box_iou.max( + dim=1, keepdim=True).values.clamp(min=t1 + 1e-6) + object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp( + min=0, max=1) + + # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j] + num_obj = gt_labels_.size(0) + indices = torch.stack( + [torch.arange(num_obj).type_as(gt_labels_), gt_labels_], + dim=0) + + object_cls_box_prob = torch.sparse_coo_tensor( + indices, object_box_prob) + + # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j] + """ + from "start" to "end" implement: + image_box_iou = torch.sparse.max(object_cls_box_prob, + dim=0).t() + + """ + # start + box_cls_prob = torch.sparse.sum( + object_cls_box_prob, dim=0).to_dense() + + indices = torch.nonzero(box_cls_prob, as_tuple=False).t_() + if indices.numel() == 0: + image_box_prob = torch.zeros( + anchors_.size(0), + self.num_classes).type_as(object_box_prob) + else: + nonzero_box_prob = torch.where( + (gt_labels_.unsqueeze(dim=-1) == indices[0]), + object_box_prob[:, indices[1]], + torch.tensor( + [0]).type_as(object_box_prob)).max(dim=0).values + + # upmap to shape [j, c] + image_box_prob = torch.sparse_coo_tensor( + indices.flip([0]), + nonzero_box_prob, + size=(anchors_.size(0), self.num_classes)).to_dense() + # end + + box_prob.append(image_box_prob) + + # construct bags for objects + match_quality_matrix = bbox_overlaps_nearest_3d( + gt_bboxes_, anchors_) + _, matched = torch.topk( + match_quality_matrix, + self.pre_anchor_topk, + dim=1, + sorted=False) + del match_quality_matrix + + # matched_cls_prob: P_{ij}^{cls} + matched_cls_prob = torch.gather( + cls_prob_[matched], 2, + gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk, + 1)).squeeze(2) + + # matched_box_prob: P_{ij}^{loc} + matched_anchors = anchors_[matched] + matched_object_targets = self.bbox_coder.encode( + matched_anchors, + gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors)) + + # direction classification loss + loss_dir = None + if self.use_direction_classifier: + # also calculate direction prob: P_{ij}^{dir} + matched_dir_targets = 
get_direction_target( + matched_anchors, + matched_object_targets, + self.dir_offset, + self.dir_limit_offset, + one_hot=False) + loss_dir = self.loss_dir( + dir_cls_preds_[matched].transpose(-2, -1), + matched_dir_targets, + reduction_override='none') + + # generate bbox weights + if self.diff_rad_by_sin: + bbox_preds_[matched], matched_object_targets = \ + self.add_sin_difference( + bbox_preds_[matched], matched_object_targets) + bbox_weights = matched_anchors.new_ones(matched_anchors.size()) + # Use pop is not right, check performance + code_weight = self.train_cfg.get('code_weight', None) + if code_weight: + bbox_weights = bbox_weights * bbox_weights.new_tensor( + code_weight) + loss_bbox = self.loss_bbox( + bbox_preds_[matched], + matched_object_targets, + bbox_weights, + reduction_override='none').sum(-1) + + if loss_dir is not None: + loss_bbox += loss_dir + matched_box_prob = torch.exp(-loss_bbox) + + # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )} + num_pos += len(gt_bboxes_) + positive_losses.append( + self.positive_bag_loss(matched_cls_prob, matched_box_prob)) + + positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos) + + # box_prob: P{a_{j} \in A_{+}} + box_prob = torch.stack(box_prob, dim=0) + + # negative_loss: + # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B|| + negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max( + 1, num_pos * self.pre_anchor_topk) + + losses = { + 'positive_bag_loss': positive_loss, + 'negative_bag_loss': negative_loss + } + return losses + + def positive_bag_loss(self, matched_cls_prob, matched_box_prob): + """Generate positive bag loss. + + Args: + matched_cls_prob (torch.Tensor): Classification probability + of matched positive samples. + matched_box_prob (torch.Tensor): Bounding box probability + of matched positive samples. + + Returns: + torch.Tensor: Loss of positive samples. + """ + # bag_prob = Mean-max(matched_prob) + matched_prob = matched_cls_prob * matched_box_prob + weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None) + weight /= weight.sum(dim=1).unsqueeze(dim=-1) + bag_prob = (weight * matched_prob).sum(dim=1) + # positive_bag_loss = -self.alpha * log(bag_prob) + bag_prob = bag_prob.clamp(0, 1) # to avoid bug of BCE, check + return self.alpha * F.binary_cross_entropy( + bag_prob, torch.ones_like(bag_prob), reduction='none') + + def negative_bag_loss(self, cls_prob, box_prob): + """Generate negative bag loss. + + Args: + cls_prob (torch.Tensor): Classification probability + of negative samples. + box_prob (torch.Tensor): Bounding box probability + of negative samples. + + Returns: + torch.Tensor: Loss of negative samples. + """ + prob = cls_prob * (1 - box_prob) + prob = prob.clamp(0, 1) # to avoid bug of BCE, check + negative_bag_loss = prob**self.gamma * F.binary_cross_entropy( + prob, torch.zeros_like(prob), reduction='none') + return (1 - self.alpha) * negative_bag_loss diff --git a/mmdet3d/models/dense_heads/groupfree3d_head.py b/mmdet3d/models/dense_heads/groupfree3d_head.py index b76cb05..394c40f 100644 --- a/mmdet3d/models/dense_heads/groupfree3d_head.py +++ b/mmdet3d/models/dense_heads/groupfree3d_head.py @@ -1,994 +1,994 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
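
The two bag losses at the end of the re-indented `FreeAnchor3DHead` above implement FreeAnchor's mean-max matching: every object owns a bag of `pre_anchor_topk` anchors, the positive loss pushes up a soft maximum of `P_cls * P_loc` inside the bag, and the negative loss focally suppresses anchors that score high for a class without being claimed by any object. A compact restatement with toy numbers (all probabilities below are illustrative):

```python
import torch
import torch.nn.functional as F

alpha, gamma = 0.5, 2.0

# One image, 2 objects, a bag of 3 anchors per object.
matched_cls_prob = torch.tensor([[0.9, 0.6, 0.2],
                                 [0.7, 0.5, 0.1]])
matched_box_prob = torch.tensor([[0.8, 0.7, 0.3],
                                 [0.9, 0.4, 0.2]])

# Positive bag loss: mean-max of P_cls * P_loc inside each bag.
matched_prob = matched_cls_prob * matched_box_prob
weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None)
weight = weight / weight.sum(dim=1, keepdim=True)
bag_prob = (weight * matched_prob).sum(dim=1).clamp(0, 1)
positive_loss = alpha * F.binary_cross_entropy(
    bag_prob, torch.ones_like(bag_prob), reduction='none')

# Negative bag loss: focal BCE on P_cls * (1 - P{anchor in A+}).
cls_prob = torch.tensor([[0.05, 0.80], [0.10, 0.02]])   # anchors x classes
box_prob = torch.tensor([[0.00, 0.90], [0.00, 0.00]])
prob = (cls_prob * (1 - box_prob)).clamp(0, 1)
negative_loss = (1 - alpha) * prob**gamma * F.binary_cross_entropy(
    prob, torch.zeros_like(prob), reduction='none')

print(positive_loss, negative_loss, sep='\n')
```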
-import copy - -import numpy as np -import torch -from mmcv import ConfigDict -from mmcv.cnn import ConvModule, xavier_init -from mmcv.cnn.bricks.transformer import (build_positional_encoding, - build_transformer_layer) -from mmcv.ops import PointsSampler as Points_Sampler -from mmcv.ops import gather_points -from mmcv.runner import BaseModule, force_fp32 -from torch import nn as nn -from torch.nn import functional as F - -from mmdet3d.core.post_processing import aligned_3d_nms -from mmdet.core import build_bbox_coder, multi_apply -from ..builder import HEADS, build_loss -from .base_conv_bbox_head import BaseConvBboxHead - -EPS = 1e-6 - - -class PointsObjClsModule(BaseModule): - """object candidate point prediction from seed point features. - - Args: - in_channel (int): number of channels of seed point features. - num_convs (int, optional): number of conv layers. - Default: 3. - conv_cfg (dict, optional): Config of convolution. - Default: dict(type='Conv1d'). - norm_cfg (dict, optional): Config of normalization. - Default: dict(type='BN1d'). - act_cfg (dict, optional): Config of activation. - Default: dict(type='ReLU'). - """ - - def __init__(self, - in_channel, - num_convs=3, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - conv_channels = [in_channel for _ in range(num_convs - 1)] - conv_channels.append(1) - - self.mlp = nn.Sequential() - prev_channels = in_channel - for i in range(num_convs): - self.mlp.add_module( - f'layer{i}', - ConvModule( - prev_channels, - conv_channels[i], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg if i < num_convs - 1 else None, - act_cfg=act_cfg if i < num_convs - 1 else None, - bias=True, - inplace=True)) - prev_channels = conv_channels[i] - - def forward(self, seed_features): - """Forward pass. - - Args: - seed_features (torch.Tensor): seed features, dims: - (batch_size, feature_dim, num_seed) - - Returns: - torch.Tensor: objectness logits, dim: - (batch_size, 1, num_seed) - """ - return self.mlp(seed_features) - - -class GeneralSamplingModule(nn.Module): - """Sampling Points. - - Sampling points with given index. - """ - - def forward(self, xyz, features, sample_inds): - """Forward pass. - - Args: - xyz: (B, N, 3) the coordinates of the features. - features (Tensor): (B, C, N) features to sample. - sample_inds (Tensor): (B, M) the given index, - where M is the number of points. - - Returns: - Tensor: (B, M, 3) coordinates of sampled features - Tensor: (B, C, M) the sampled features. - Tensor: (B, M) the given index. - """ - xyz_t = xyz.transpose(1, 2).contiguous() - new_xyz = gather_points(xyz_t, sample_inds).transpose(1, - 2).contiguous() - new_features = gather_points(features, sample_inds).contiguous() - - return new_xyz, new_features, sample_inds - - -@HEADS.register_module() -class GroupFree3DHead(BaseModule): - r"""Bbox head of `Group-Free 3D `_. - - Args: - num_classes (int): The number of class. - in_channels (int): The dims of input features from backbone. - bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and - decoding boxes. - num_decoder_layers (int): The number of transformer decoder layers. - transformerlayers (dict): Config for transformer decoder. - train_cfg (dict): Config for training. - test_cfg (dict): Config for testing. - num_proposal (int): The number of initial sampling candidates. - pred_layer_cfg (dict): Config of classfication and regression - prediction layers. 
- size_cls_agnostic (bool): Whether the predicted size is class-agnostic. - gt_per_seed (int): the number of candidate instance each point belongs - to. - sampling_objectness_loss (dict): Config of initial sampling - objectness loss. - objectness_loss (dict): Config of objectness loss. - center_loss (dict): Config of center loss. - dir_class_loss (dict): Config of direction classification loss. - dir_res_loss (dict): Config of direction residual regression loss. - size_class_loss (dict): Config of size classification loss. - size_res_loss (dict): Config of size residual regression loss. - size_reg_loss (dict): Config of class-agnostic size regression loss. - semantic_loss (dict): Config of point-wise semantic segmentation loss. - """ - - def __init__(self, - num_classes, - in_channels, - bbox_coder, - num_decoder_layers, - transformerlayers, - decoder_self_posembeds=dict( - type='ConvBNPositionalEncoding', - input_channel=6, - num_pos_feats=288), - decoder_cross_posembeds=dict( - type='ConvBNPositionalEncoding', - input_channel=3, - num_pos_feats=288), - train_cfg=None, - test_cfg=None, - num_proposal=128, - pred_layer_cfg=None, - size_cls_agnostic=True, - gt_per_seed=3, - sampling_objectness_loss=None, - objectness_loss=None, - center_loss=None, - dir_class_loss=None, - dir_res_loss=None, - size_class_loss=None, - size_res_loss=None, - size_reg_loss=None, - semantic_loss=None, - init_cfg=None): - super(GroupFree3DHead, self).__init__(init_cfg=init_cfg) - self.num_classes = num_classes - self.train_cfg = train_cfg - self.test_cfg = test_cfg - self.num_proposal = num_proposal - self.in_channels = in_channels - self.num_decoder_layers = num_decoder_layers - self.size_cls_agnostic = size_cls_agnostic - self.gt_per_seed = gt_per_seed - - # Transformer decoder layers - if isinstance(transformerlayers, ConfigDict): - transformerlayers = [ - copy.deepcopy(transformerlayers) - for _ in range(num_decoder_layers) - ] - else: - assert isinstance(transformerlayers, list) and \ - len(transformerlayers) == num_decoder_layers - self.decoder_layers = nn.ModuleList() - for i in range(self.num_decoder_layers): - self.decoder_layers.append( - build_transformer_layer(transformerlayers[i])) - self.embed_dims = self.decoder_layers[0].embed_dims - assert self.embed_dims == decoder_self_posembeds['num_pos_feats'] - assert self.embed_dims == decoder_cross_posembeds['num_pos_feats'] - - # bbox_coder - self.bbox_coder = build_bbox_coder(bbox_coder) - self.num_sizes = self.bbox_coder.num_sizes - self.num_dir_bins = self.bbox_coder.num_dir_bins - - # Initial object candidate sampling - self.gsample_module = GeneralSamplingModule() - self.fps_module = Points_Sampler([self.num_proposal]) - self.points_obj_cls = PointsObjClsModule(self.in_channels) - - self.fp16_enabled = False - - # initial candidate prediction - self.conv_pred = BaseConvBboxHead( - **pred_layer_cfg, - num_cls_out_channels=self._get_cls_out_channels(), - num_reg_out_channels=self._get_reg_out_channels()) - - # query proj and key proj - self.decoder_query_proj = nn.Conv1d( - self.embed_dims, self.embed_dims, kernel_size=1) - self.decoder_key_proj = nn.Conv1d( - self.embed_dims, self.embed_dims, kernel_size=1) - - # query position embed - self.decoder_self_posembeds = nn.ModuleList() - for _ in range(self.num_decoder_layers): - self.decoder_self_posembeds.append( - build_positional_encoding(decoder_self_posembeds)) - # key position embed - self.decoder_cross_posembeds = nn.ModuleList() - for _ in range(self.num_decoder_layers): - 
self.decoder_cross_posembeds.append( - build_positional_encoding(decoder_cross_posembeds)) - - # Prediction Head - self.prediction_heads = nn.ModuleList() - for i in range(self.num_decoder_layers): - self.prediction_heads.append( - BaseConvBboxHead( - **pred_layer_cfg, - num_cls_out_channels=self._get_cls_out_channels(), - num_reg_out_channels=self._get_reg_out_channels())) - - self.sampling_objectness_loss = build_loss(sampling_objectness_loss) - self.objectness_loss = build_loss(objectness_loss) - self.center_loss = build_loss(center_loss) - self.dir_res_loss = build_loss(dir_res_loss) - self.dir_class_loss = build_loss(dir_class_loss) - self.semantic_loss = build_loss(semantic_loss) - if self.size_cls_agnostic: - self.size_reg_loss = build_loss(size_reg_loss) - else: - self.size_res_loss = build_loss(size_res_loss) - self.size_class_loss = build_loss(size_class_loss) - - def init_weights(self): - """Initialize weights of transformer decoder in GroupFree3DHead.""" - # initialize transformer - for m in self.decoder_layers.parameters(): - if m.dim() > 1: - xavier_init(m, distribution='uniform') - for m in self.decoder_self_posembeds.parameters(): - if m.dim() > 1: - xavier_init(m, distribution='uniform') - for m in self.decoder_cross_posembeds.parameters(): - if m.dim() > 1: - xavier_init(m, distribution='uniform') - - def _get_cls_out_channels(self): - """Return the channel number of classification outputs.""" - # Class numbers (k) + objectness (1) - return self.num_classes + 1 - - def _get_reg_out_channels(self): - """Return the channel number of regression outputs.""" - # center residual (3), - # heading class+residual (num_dir_bins*2), - # size class+residual(num_sizes*4 or 3) - if self.size_cls_agnostic: - return 6 + self.num_dir_bins * 2 - else: - return 3 + self.num_dir_bins * 2 + self.num_sizes * 4 - - def _extract_input(self, feat_dict): - """Extract inputs from features dictionary. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - torch.Tensor: Coordinates of input points. - torch.Tensor: Features of input points. - torch.Tensor: Indices of input points. - """ - - seed_points = feat_dict['fp_xyz'][-1] - seed_features = feat_dict['fp_features'][-1] - seed_indices = feat_dict['fp_indices'][-1] - - return seed_points, seed_features, seed_indices - - def forward(self, feat_dict, sample_mod): - """Forward pass. - - Note: - The forward of GroupFree3DHead is divided into 2 steps: - - 1. Initial object candidates sampling. - 2. Iterative object box prediction by transformer decoder. - - Args: - feat_dict (dict): Feature dict from backbone. - sample_mod (str): sample mode for initial candidates sampling. - - Returns: - results (dict): Predictions of GroupFree3D head. - """ - assert sample_mod in ['fps', 'kps'] - - seed_xyz, seed_features, seed_indices = self._extract_input(feat_dict) - - results = dict( - seed_points=seed_xyz, - seed_features=seed_features, - seed_indices=seed_indices) - - # 1. Initial object candidates sampling. 
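
The sampling step introduced by the comment above has two modes; the `kps` branch just below scores every seed point with `PointsObjClsModule`, keeps the top `num_proposal` scores, and gathers the matching coordinates and features through `GeneralSamplingModule` (which wraps `mmcv.ops.gather_points`). The sketch reproduces the same selection with plain `torch.gather`, so it runs on CPU; all shapes are illustrative:

```python
import torch

batch_size, num_seed, feat_dim, num_proposal = 2, 1024, 288, 256

seed_xyz = torch.randn(batch_size, num_seed, 3)               # (B, N, 3)
seed_features = torch.randn(batch_size, feat_dim, num_seed)   # (B, C, N)
points_obj_cls_logits = torch.randn(batch_size, 1, num_seed)  # (B, 1, N)

scores = points_obj_cls_logits.sigmoid().squeeze(1)           # (B, N)
sample_inds = torch.topk(scores, num_proposal, dim=1)[1]      # (B, M)

# Gather coordinates: expand indices to (B, M, 3).
candidate_xyz = torch.gather(
    seed_xyz, 1, sample_inds.unsqueeze(-1).expand(-1, -1, 3))

# Gather features: expand indices to (B, C, M) along the point axis.
candidate_features = torch.gather(
    seed_features, 2, sample_inds.unsqueeze(1).expand(-1, feat_dim, -1))

print(candidate_xyz.shape, candidate_features.shape)
# torch.Size([2, 256, 3]) torch.Size([2, 288, 256])
```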
- if sample_mod == 'fps': - sample_inds = self.fps_module(seed_xyz, seed_features) - elif sample_mod == 'kps': - points_obj_cls_logits = self.points_obj_cls( - seed_features) # (batch_size, 1, num_seed) - points_obj_cls_scores = points_obj_cls_logits.sigmoid().squeeze(1) - sample_inds = torch.topk(points_obj_cls_scores, - self.num_proposal)[1].int() - results['seeds_obj_cls_logits'] = points_obj_cls_logits - else: - raise NotImplementedError( - f'Sample mode {sample_mod} is not supported!') - - candidate_xyz, candidate_features, sample_inds = self.gsample_module( - seed_xyz, seed_features, sample_inds) - - results['query_points_xyz'] = candidate_xyz # (B, M, 3) - results['query_points_feature'] = candidate_features # (B, C, M) - results['query_points_sample_inds'] = sample_inds.long() # (B, M) - - prefix = 'proposal.' - cls_predictions, reg_predictions = self.conv_pred(candidate_features) - decode_res = self.bbox_coder.split_pred(cls_predictions, - reg_predictions, candidate_xyz, - prefix) - - results.update(decode_res) - bbox3d = self.bbox_coder.decode(results, prefix) - - # 2. Iterative object box prediction by transformer decoder. - base_bbox3d = bbox3d[:, :, :6].detach().clone() - - query = self.decoder_query_proj(candidate_features).permute(2, 0, 1) - key = self.decoder_key_proj(seed_features).permute(2, 0, 1) - value = key - - # transformer decoder - results['num_decoder_layers'] = 0 - for i in range(self.num_decoder_layers): - prefix = f's{i}.' - - query_pos = self.decoder_self_posembeds[i](base_bbox3d).permute( - 2, 0, 1) - key_pos = self.decoder_cross_posembeds[i](seed_xyz).permute( - 2, 0, 1) - - query = self.decoder_layers[i]( - query, key, value, query_pos=query_pos, - key_pos=key_pos).permute(1, 2, 0) - - results[f'{prefix}query'] = query - - cls_predictions, reg_predictions = self.prediction_heads[i](query) - decode_res = self.bbox_coder.split_pred(cls_predictions, - reg_predictions, - candidate_xyz, prefix) - # TODO: should save bbox3d instead of decode_res? - results.update(decode_res) - - bbox3d = self.bbox_coder.decode(results, prefix) - results[f'{prefix}bbox3d'] = bbox3d - base_bbox3d = bbox3d[:, :, :6].detach().clone() - query = query.permute(2, 0, 1) - - results['num_decoder_layers'] += 1 - - return results - - @force_fp32(apply_to=('bbox_preds', )) - def loss(self, - bbox_preds, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - img_metas=None, - gt_bboxes_ignore=None, - ret_target=False): - """Compute loss. - - Args: - bbox_preds (dict): Predictions from forward of vote head. - points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each sample. - gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (list[torch.Tensor]): Point-wise - semantic mask. - pts_instance_mask (list[torch.Tensor]): Point-wise - instance mask. - img_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - ret_target (Bool): Return targets or not. - - Returns: - dict: Losses of GroupFree3D. 
- """ - targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, - bbox_preds) - (sampling_targets, sampling_weights, assigned_size_targets, - size_class_targets, size_res_targets, dir_class_targets, - dir_res_targets, center_targets, assigned_center_targets, - mask_targets, valid_gt_masks, objectness_targets, objectness_weights, - box_loss_weights, valid_gt_weights) = targets - - batch_size, proposal_num = size_class_targets.shape[:2] - - losses = dict() - - # calculate objectness classification loss - sampling_obj_score = bbox_preds['seeds_obj_cls_logits'].reshape(-1, 1) - sampling_objectness_loss = self.sampling_objectness_loss( - sampling_obj_score, - 1 - sampling_targets.reshape(-1), - sampling_weights.reshape(-1), - avg_factor=batch_size) - losses['sampling_objectness_loss'] = sampling_objectness_loss - - prefixes = ['proposal.'] + [ - f's{i}.' for i in range(bbox_preds['num_decoder_layers']) - ] - num_stages = len(prefixes) - for prefix in prefixes: - - # calculate objectness loss - obj_score = bbox_preds[f'{prefix}obj_scores'].transpose(2, 1) - objectness_loss = self.objectness_loss( - obj_score.reshape(-1, 1), - 1 - objectness_targets.reshape(-1), - objectness_weights.reshape(-1), - avg_factor=batch_size) - losses[f'{prefix}objectness_loss'] = objectness_loss / num_stages - - # calculate center loss - box_loss_weights_expand = box_loss_weights.unsqueeze(-1).expand( - -1, -1, 3) - center_loss = self.center_loss( - bbox_preds[f'{prefix}center'], - assigned_center_targets, - weight=box_loss_weights_expand) - losses[f'{prefix}center_loss'] = center_loss / num_stages - - # calculate direction class loss - dir_class_loss = self.dir_class_loss( - bbox_preds[f'{prefix}dir_class'].transpose(2, 1), - dir_class_targets, - weight=box_loss_weights) - losses[f'{prefix}dir_class_loss'] = dir_class_loss / num_stages - - # calculate direction residual loss - heading_label_one_hot = size_class_targets.new_zeros( - (batch_size, proposal_num, self.num_dir_bins)) - heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), - 1) - dir_res_norm = torch.sum( - bbox_preds[f'{prefix}dir_res_norm'] * heading_label_one_hot, - -1) - dir_res_loss = self.dir_res_loss( - dir_res_norm, dir_res_targets, weight=box_loss_weights) - losses[f'{prefix}dir_res_loss'] = dir_res_loss / num_stages - - if self.size_cls_agnostic: - # calculate class-agnostic size loss - size_reg_loss = self.size_reg_loss( - bbox_preds[f'{prefix}size'], - assigned_size_targets, - weight=box_loss_weights_expand) - losses[f'{prefix}size_reg_loss'] = size_reg_loss / num_stages - - else: - # calculate size class loss - size_class_loss = self.size_class_loss( - bbox_preds[f'{prefix}size_class'].transpose(2, 1), - size_class_targets, - weight=box_loss_weights) - losses[ - f'{prefix}size_class_loss'] = size_class_loss / num_stages - - # calculate size residual loss - one_hot_size_targets = size_class_targets.new_zeros( - (batch_size, proposal_num, self.num_sizes)) - one_hot_size_targets.scatter_(2, - size_class_targets.unsqueeze(-1), - 1) - one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( - -1).expand(-1, -1, -1, 3).contiguous() - size_residual_norm = torch.sum( - bbox_preds[f'{prefix}size_res_norm'] * - one_hot_size_targets_expand, 2) - box_loss_weights_expand = box_loss_weights.unsqueeze( - -1).expand(-1, -1, 3) - size_res_loss = self.size_res_loss( - size_residual_norm, - size_res_targets, - weight=box_loss_weights_expand) - losses[f'{prefix}size_res_loss'] = size_res_loss / 
num_stages - - # calculate semantic loss - semantic_loss = self.semantic_loss( - bbox_preds[f'{prefix}sem_scores'].transpose(2, 1), - mask_targets, - weight=box_loss_weights) - losses[f'{prefix}semantic_loss'] = semantic_loss / num_stages - - if ret_target: - losses['targets'] = targets - - return losses - - def get_targets(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - bbox_preds=None, - max_gt_num=64): - """Generate targets of GroupFree3D head. - - Args: - points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (list[torch.Tensor]): Point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): Point-wise instance - label of each batch. - bbox_preds (torch.Tensor): Bounding box predictions of vote head. - max_gt_num (int): Max number of GTs for single batch. - - Returns: - tuple[torch.Tensor]: Targets of GroupFree3D head. - """ - # find empty example - valid_gt_masks = list() - gt_num = list() - for index in range(len(gt_labels_3d)): - if len(gt_labels_3d[index]) == 0: - fake_box = gt_bboxes_3d[index].tensor.new_zeros( - 1, gt_bboxes_3d[index].tensor.shape[-1]) - gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) - gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) - valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) - gt_num.append(1) - else: - valid_gt_masks.append(gt_labels_3d[index].new_ones( - gt_labels_3d[index].shape)) - gt_num.append(gt_labels_3d[index].shape[0]) - # max_gt_num = max(gt_num) - - max_gt_nums = [max_gt_num for _ in range(len(gt_labels_3d))] - - if pts_semantic_mask is None: - pts_semantic_mask = [None for i in range(len(gt_labels_3d))] - pts_instance_mask = [None for i in range(len(gt_labels_3d))] - - seed_points = [ - bbox_preds['seed_points'][i] for i in range(len(gt_labels_3d)) - ] - - seed_indices = [ - bbox_preds['seed_indices'][i] for i in range(len(gt_labels_3d)) - ] - - candidate_indices = [ - bbox_preds['query_points_sample_inds'][i] - for i in range(len(gt_labels_3d)) - ] - - (sampling_targets, assigned_size_targets, size_class_targets, - size_res_targets, dir_class_targets, dir_res_targets, center_targets, - assigned_center_targets, mask_targets, objectness_targets, - objectness_masks) = multi_apply(self.get_targets_single, points, - gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, - max_gt_nums, seed_points, - seed_indices, candidate_indices) - - # pad targets as original code of GroupFree3D. 
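        # Illustrative note (assuming the default max_gt_num=64): every
        # per-sample target returned by `get_targets_single` is right-padded
        # to the same length so the per-sample lists can be stacked into
        # fixed-shape batch tensors below, e.g.
        #   F.pad(torch.ones(3), (0, 2))  ->  tensor([1., 1., 1., 0., 0.])
        # Padded (fake) boxes receive zero entries in `valid_gt_masks`, so
        # they contribute nothing to the weighted losses.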
- for index in range(len(gt_labels_3d)): - pad_num = max_gt_num - gt_labels_3d[index].shape[0] - valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num)) - - sampling_targets = torch.stack(sampling_targets) - sampling_weights = (sampling_targets >= 0).float() - sampling_normalizer = sampling_weights.sum(dim=1, keepdim=True).float() - sampling_weights /= sampling_normalizer.clamp(min=1.0) - - assigned_size_targets = torch.stack(assigned_size_targets) - center_targets = torch.stack(center_targets) - valid_gt_masks = torch.stack(valid_gt_masks) - - assigned_center_targets = torch.stack(assigned_center_targets) - objectness_targets = torch.stack(objectness_targets) - - objectness_weights = torch.stack(objectness_masks) - cls_normalizer = objectness_weights.sum(dim=1, keepdim=True).float() - objectness_weights /= cls_normalizer.clamp(min=1.0) - - box_loss_weights = objectness_targets.float() / ( - objectness_targets.sum().float() + EPS) - - valid_gt_weights = valid_gt_masks.float() / ( - valid_gt_masks.sum().float() + EPS) - - dir_class_targets = torch.stack(dir_class_targets) - dir_res_targets = torch.stack(dir_res_targets) - size_class_targets = torch.stack(size_class_targets) - size_res_targets = torch.stack(size_res_targets) - mask_targets = torch.stack(mask_targets) - - return (sampling_targets, sampling_weights, assigned_size_targets, - size_class_targets, size_res_targets, dir_class_targets, - dir_res_targets, center_targets, assigned_center_targets, - mask_targets, valid_gt_masks, objectness_targets, - objectness_weights, box_loss_weights, valid_gt_weights) - - def get_targets_single(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - max_gt_nums=None, - seed_points=None, - seed_indices=None, - candidate_indices=None, - seed_points_obj_topk=4): - """Generate targets of GroupFree3D head for single batch. - - Args: - points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth - boxes of each batch. - gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (torch.Tensor): Point-wise semantic - label of each batch. - pts_instance_mask (torch.Tensor): Point-wise instance - label of each batch. - max_gt_nums (int): Max number of GTs for single batch. - seed_points (torch.Tensor): Coordinates of seed points. - seed_indices (torch.Tensor): Indices of seed points. - candidate_indices (torch.Tensor): Indices of object candidates. - seed_points_obj_topk (int): k value of k-Closest Points Sampling. - - Returns: - tuple[torch.Tensor]: Targets of GroupFree3D head. 
- """ - - assert self.bbox_coder.with_rot or pts_semantic_mask is not None - - gt_bboxes_3d = gt_bboxes_3d.to(points.device) - - # generate center, dir, size target - (center_targets, size_targets, size_class_targets, size_res_targets, - dir_class_targets, - dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) - - # pad targets as original code of GroupFree3D - pad_num = max_gt_nums - gt_labels_3d.shape[0] - box_label_mask = points.new_zeros([max_gt_nums]) - box_label_mask[:gt_labels_3d.shape[0]] = 1 - - gt_bboxes_pad = F.pad(gt_bboxes_3d.tensor, (0, 0, 0, pad_num)) - gt_bboxes_pad[gt_labels_3d.shape[0]:, 0:3] += 1000 - gt_bboxes_3d = gt_bboxes_3d.new_box(gt_bboxes_pad) - - gt_labels_3d = F.pad(gt_labels_3d, (0, pad_num)) - - center_targets = F.pad(center_targets, (0, 0, 0, pad_num), value=1000) - size_targets = F.pad(size_targets, (0, 0, 0, pad_num)) - size_class_targets = F.pad(size_class_targets, (0, pad_num)) - size_res_targets = F.pad(size_res_targets, (0, 0, 0, pad_num)) - dir_class_targets = F.pad(dir_class_targets, (0, pad_num)) - dir_res_targets = F.pad(dir_res_targets, (0, pad_num)) - - # 0. generate pts_instance_label and pts_obj_mask - num_points = points.shape[0] - pts_obj_mask = points.new_zeros([num_points], dtype=torch.long) - pts_instance_label = points.new_zeros([num_points], - dtype=torch.long) - 1 - - if self.bbox_coder.with_rot: - vote_targets = points.new_zeros([num_points, 4 * self.gt_per_seed]) - vote_target_idx = points.new_zeros([num_points], dtype=torch.long) - box_indices_all = gt_bboxes_3d.points_in_boxes_part(points) - for i in range(gt_labels_3d.shape[0]): - box_indices = box_indices_all[:, i] - indices = torch.nonzero( - box_indices, as_tuple=False).squeeze(-1) - selected_points = points[indices] - pts_obj_mask[indices] = 1 - vote_targets_tmp = vote_targets[indices] - votes = gt_bboxes_3d.gravity_center[i].unsqueeze( - 0) - selected_points[:, :3] - - for j in range(self.gt_per_seed): - column_indices = torch.nonzero( - vote_target_idx[indices] == j, - as_tuple=False).squeeze(-1) - vote_targets_tmp[column_indices, - int(j * 3):int(j * 3 + - 3)] = votes[column_indices] - vote_targets_tmp[column_indices, - j + 3 * self.gt_per_seed] = i - if j == 0: - vote_targets_tmp[ - column_indices, :3 * - self.gt_per_seed] = votes[column_indices].repeat( - 1, self.gt_per_seed) - vote_targets_tmp[column_indices, - 3 * self.gt_per_seed:] = i - - vote_targets[indices] = vote_targets_tmp - vote_target_idx[indices] = torch.clamp( - vote_target_idx[indices] + 1, max=2) - - dist = points.new_zeros([num_points, self.gt_per_seed]) + 1000 - for j in range(self.gt_per_seed): - dist[:, j] = (vote_targets[:, 3 * j:3 * j + 3]**2).sum(-1) - - instance_indices = torch.argmin( - dist, dim=-1).unsqueeze(-1) + 3 * self.gt_per_seed - instance_lable = torch.gather(vote_targets, 1, - instance_indices).squeeze(-1) - pts_instance_label = instance_lable.long() - pts_instance_label[pts_obj_mask == 0] = -1 - - elif pts_semantic_mask is not None: - for i in torch.unique(pts_instance_mask): - indices = torch.nonzero( - pts_instance_mask == i, as_tuple=False).squeeze(-1) - - if pts_semantic_mask[indices[0]] < self.num_classes: - selected_points = points[indices, :3] - center = 0.5 * ( - selected_points.min(0)[0] + selected_points.max(0)[0]) - - delta_xyz = center - center_targets - instance_lable = torch.argmin((delta_xyz**2).sum(-1)) - pts_instance_label[indices] = instance_lable - pts_obj_mask[indices] = 1 - - else: - raise NotImplementedError - - # 1. 
generate objectness targets in sampling head - gt_num = gt_labels_3d.shape[0] - num_seed = seed_points.shape[0] - num_candidate = candidate_indices.shape[0] - - object_assignment = torch.gather(pts_instance_label, 0, seed_indices) - # set background points to the last gt bbox as original code - object_assignment[object_assignment < 0] = gt_num - 1 - object_assignment_one_hot = gt_bboxes_3d.tensor.new_zeros( - (num_seed, gt_num)) - object_assignment_one_hot.scatter_(1, object_assignment.unsqueeze(-1), - 1) # (num_seed, gt_num) - - delta_xyz = seed_points.unsqueeze( - 1) - gt_bboxes_3d.gravity_center.unsqueeze( - 0) # (num_seed, gt_num, 3) - delta_xyz = delta_xyz / (gt_bboxes_3d.dims.unsqueeze(0) + EPS) - - new_dist = torch.sum(delta_xyz**2, dim=-1) - euclidean_dist1 = torch.sqrt(new_dist + EPS) - euclidean_dist1 = euclidean_dist1 * object_assignment_one_hot + 100 * ( - 1 - object_assignment_one_hot) - # (gt_num, num_seed) - euclidean_dist1 = euclidean_dist1.permute(1, 0) - - # gt_num x topk - topk_inds = torch.topk( - euclidean_dist1, - seed_points_obj_topk, - largest=False)[1] * box_label_mask[:, None] + \ - (box_label_mask[:, None] - 1) - topk_inds = topk_inds.long() - topk_inds = topk_inds.view(-1).contiguous() - - sampling_targets = torch.zeros( - num_seed + 1, dtype=torch.long).to(points.device) - sampling_targets[topk_inds] = 1 - sampling_targets = sampling_targets[:num_seed] - # pts_instance_label - objectness_label_mask = torch.gather(pts_instance_label, 0, - seed_indices) # num_seed - sampling_targets[objectness_label_mask < 0] = 0 - - # 2. objectness target - seed_obj_gt = torch.gather(pts_obj_mask, 0, seed_indices) # num_seed - objectness_targets = torch.gather(seed_obj_gt, 0, - candidate_indices) # num_candidate - - # 3. box target - seed_instance_label = torch.gather(pts_instance_label, 0, - seed_indices) # num_seed - query_points_instance_label = torch.gather( - seed_instance_label, 0, candidate_indices) # num_candidate - - # Set assignment - # (num_candidate, ) with values in 0,1,...,gt_num-1 - assignment = query_points_instance_label - # set background points to the last gt bbox as original code - assignment[assignment < 0] = gt_num - 1 - assignment_expand = assignment.unsqueeze(1).expand(-1, 3) - - assigned_center_targets = center_targets[assignment] - assigned_size_targets = size_targets[assignment] - - dir_class_targets = dir_class_targets[assignment] - dir_res_targets = dir_res_targets[assignment] - dir_res_targets /= (np.pi / self.num_dir_bins) - - size_class_targets = size_class_targets[assignment] - size_res_targets = \ - torch.gather(size_res_targets, 0, assignment_expand) - one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros( - (num_candidate, self.num_sizes)) - one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1) - one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).expand( - -1, -1, 3) # (num_candidate,num_size_cluster,3) - mean_sizes = size_res_targets.new_tensor( - self.bbox_coder.mean_sizes).unsqueeze(0) - pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1) - size_res_targets /= pos_mean_sizes - - mask_targets = gt_labels_3d[assignment].long() - - objectness_masks = points.new_ones((num_candidate)) - - return (sampling_targets, assigned_size_targets, size_class_targets, - size_res_targets, dir_class_targets, dir_res_targets, - center_targets, assigned_center_targets, mask_targets, - objectness_targets, objectness_masks) - - def get_bboxes(self, - points, - bbox_preds, - input_metas, - rescale=False, - use_nms=True): - 
"""Generate bboxes from GroupFree3D head predictions. - - Args: - points (torch.Tensor): Input points. - bbox_preds (dict): Predictions from GroupFree3D head. - input_metas (list[dict]): Point cloud and image's meta info. - rescale (bool): Whether to rescale bboxes. - use_nms (bool): Whether to apply NMS, skip nms postprocessing - while using GroupFree3D head in rpn stage. - - Returns: - list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. - """ - # support multi-stage predictions - assert self.test_cfg['prediction_stages'] in \ - ['last', 'all', 'last_three'] - - prefixes = list() - if self.test_cfg['prediction_stages'] == 'last': - prefixes = [f's{self.num_decoder_layers - 1}.'] - elif self.test_cfg['prediction_stages'] == 'all': - prefixes = ['proposal.'] + \ - [f's{i}.' for i in range(self.num_decoder_layers)] - elif self.test_cfg['prediction_stages'] == 'last_three': - prefixes = [ - f's{i}.' for i in range(self.num_decoder_layers - - 3, self.num_decoder_layers) - ] - else: - raise NotImplementedError - - obj_scores = list() - sem_scores = list() - bbox3d = list() - for prefix in prefixes: - # decode boxes - obj_score = bbox_preds[f'{prefix}obj_scores'][..., -1].sigmoid() - sem_score = bbox_preds[f'{prefix}sem_scores'].softmax(-1) - bbox = self.bbox_coder.decode(bbox_preds, prefix) - obj_scores.append(obj_score) - sem_scores.append(sem_score) - bbox3d.append(bbox) - - obj_scores = torch.cat(obj_scores, dim=1) - sem_scores = torch.cat(sem_scores, dim=1) - bbox3d = torch.cat(bbox3d, dim=1) - - if use_nms: - batch_size = bbox3d.shape[0] - results = list() - for b in range(batch_size): - bbox_selected, score_selected, labels = \ - self.multiclass_nms_single(obj_scores[b], sem_scores[b], - bbox3d[b], points[b, ..., :3], - input_metas[b]) - bbox = input_metas[b]['box_type_3d']( - bbox_selected, - box_dim=bbox_selected.shape[-1], - with_yaw=self.bbox_coder.with_rot) - results.append((bbox, score_selected, labels)) - - return results - else: - return bbox3d - - def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, - input_meta): - """Multi-class nms in single batch. - - Args: - obj_scores (torch.Tensor): Objectness score of bounding boxes. - sem_scores (torch.Tensor): semantic class score of bounding boxes. - bbox (torch.Tensor): Predicted bounding boxes. - points (torch.Tensor): Input points. - input_meta (dict): Point cloud and image's meta info. - - Returns: - tuple[torch.Tensor]: Bounding boxes, scores and labels. 
- """ - bbox = input_meta['box_type_3d']( - bbox, - box_dim=bbox.shape[-1], - with_yaw=self.bbox_coder.with_rot, - origin=(0.5, 0.5, 0.5)) - box_indices = bbox.points_in_boxes_all(points) - - corner3d = bbox.corners - minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) - minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] - minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] - - nonempty_box_mask = box_indices.T.sum(1) > 5 - - bbox_classes = torch.argmax(sem_scores, -1) - nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], - obj_scores[nonempty_box_mask], - bbox_classes[nonempty_box_mask], - self.test_cfg.nms_thr) - - # filter empty boxes and boxes with low score - scores_mask = (obj_scores > self.test_cfg.score_thr) - nonempty_box_inds = torch.nonzero( - nonempty_box_mask, as_tuple=False).flatten() - nonempty_mask = torch.zeros_like(bbox_classes).scatter( - 0, nonempty_box_inds[nms_selected], 1) - selected = (nonempty_mask.bool() & scores_mask.bool()) - - if self.test_cfg.per_class_proposal: - bbox_selected, score_selected, labels = [], [], [] - for k in range(sem_scores.shape[-1]): - bbox_selected.append(bbox[selected].tensor) - score_selected.append(obj_scores[selected] * - sem_scores[selected][:, k]) - labels.append( - torch.zeros_like(bbox_classes[selected]).fill_(k)) - bbox_selected = torch.cat(bbox_selected, 0) - score_selected = torch.cat(score_selected, 0) - labels = torch.cat(labels, 0) - else: - bbox_selected = bbox[selected].tensor - score_selected = obj_scores[selected] - labels = bbox_classes[selected] - - return bbox_selected, score_selected, labels +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import numpy as np +import torch +from mmcv import ConfigDict +from mmcv.cnn import ConvModule, xavier_init +from mmcv.cnn.bricks.transformer import (build_positional_encoding, + build_transformer_layer) +from mmcv.ops import PointsSampler as Points_Sampler +from mmcv.ops import gather_points +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn +from torch.nn import functional as F + +from mmdet3d.core.post_processing import aligned_3d_nms +from mmdet.core import build_bbox_coder, multi_apply +from ..builder import HEADS, build_loss +from .base_conv_bbox_head import BaseConvBboxHead + +EPS = 1e-6 + + +class PointsObjClsModule(BaseModule): + """object candidate point prediction from seed point features. + + Args: + in_channel (int): number of channels of seed point features. + num_convs (int, optional): number of conv layers. + Default: 3. + conv_cfg (dict, optional): Config of convolution. + Default: dict(type='Conv1d'). + norm_cfg (dict, optional): Config of normalization. + Default: dict(type='BN1d'). + act_cfg (dict, optional): Config of activation. + Default: dict(type='ReLU'). + """ + + def __init__(self, + in_channel, + num_convs=3, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + conv_channels = [in_channel for _ in range(num_convs - 1)] + conv_channels.append(1) + + self.mlp = nn.Sequential() + prev_channels = in_channel + for i in range(num_convs): + self.mlp.add_module( + f'layer{i}', + ConvModule( + prev_channels, + conv_channels[i], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if i < num_convs - 1 else None, + act_cfg=act_cfg if i < num_convs - 1 else None, + bias=True, + inplace=True)) + prev_channels = conv_channels[i] + + def forward(self, seed_features): + """Forward pass. 
+ + Args: + seed_features (torch.Tensor): seed features, dims: + (batch_size, feature_dim, num_seed) + + Returns: + torch.Tensor: objectness logits, dim: + (batch_size, 1, num_seed) + """ + return self.mlp(seed_features) + + +class GeneralSamplingModule(nn.Module): + """Sampling Points. + + Sampling points with given index. + """ + + def forward(self, xyz, features, sample_inds): + """Forward pass. + + Args: + xyz: (B, N, 3) the coordinates of the features. + features (Tensor): (B, C, N) features to sample. + sample_inds (Tensor): (B, M) the given index, + where M is the number of points. + + Returns: + Tensor: (B, M, 3) coordinates of sampled features + Tensor: (B, C, M) the sampled features. + Tensor: (B, M) the given index. + """ + xyz_t = xyz.transpose(1, 2).contiguous() + new_xyz = gather_points(xyz_t, sample_inds).transpose(1, + 2).contiguous() + new_features = gather_points(features, sample_inds).contiguous() + + return new_xyz, new_features, sample_inds + + +@HEADS.register_module() +class GroupFree3DHead(BaseModule): + r"""Bbox head of `Group-Free 3D `_. + + Args: + num_classes (int): The number of class. + in_channels (int): The dims of input features from backbone. + bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and + decoding boxes. + num_decoder_layers (int): The number of transformer decoder layers. + transformerlayers (dict): Config for transformer decoder. + train_cfg (dict): Config for training. + test_cfg (dict): Config for testing. + num_proposal (int): The number of initial sampling candidates. + pred_layer_cfg (dict): Config of classfication and regression + prediction layers. + size_cls_agnostic (bool): Whether the predicted size is class-agnostic. + gt_per_seed (int): the number of candidate instance each point belongs + to. + sampling_objectness_loss (dict): Config of initial sampling + objectness loss. + objectness_loss (dict): Config of objectness loss. + center_loss (dict): Config of center loss. + dir_class_loss (dict): Config of direction classification loss. + dir_res_loss (dict): Config of direction residual regression loss. + size_class_loss (dict): Config of size classification loss. + size_res_loss (dict): Config of size residual regression loss. + size_reg_loss (dict): Config of class-agnostic size regression loss. + semantic_loss (dict): Config of point-wise semantic segmentation loss. 
+ """ + + def __init__(self, + num_classes, + in_channels, + bbox_coder, + num_decoder_layers, + transformerlayers, + decoder_self_posembeds=dict( + type='ConvBNPositionalEncoding', + input_channel=6, + num_pos_feats=288), + decoder_cross_posembeds=dict( + type='ConvBNPositionalEncoding', + input_channel=3, + num_pos_feats=288), + train_cfg=None, + test_cfg=None, + num_proposal=128, + pred_layer_cfg=None, + size_cls_agnostic=True, + gt_per_seed=3, + sampling_objectness_loss=None, + objectness_loss=None, + center_loss=None, + dir_class_loss=None, + dir_res_loss=None, + size_class_loss=None, + size_res_loss=None, + size_reg_loss=None, + semantic_loss=None, + init_cfg=None): + super(GroupFree3DHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.num_proposal = num_proposal + self.in_channels = in_channels + self.num_decoder_layers = num_decoder_layers + self.size_cls_agnostic = size_cls_agnostic + self.gt_per_seed = gt_per_seed + + # Transformer decoder layers + if isinstance(transformerlayers, ConfigDict): + transformerlayers = [ + copy.deepcopy(transformerlayers) + for _ in range(num_decoder_layers) + ] + else: + assert isinstance(transformerlayers, list) and \ + len(transformerlayers) == num_decoder_layers + self.decoder_layers = nn.ModuleList() + for i in range(self.num_decoder_layers): + self.decoder_layers.append( + build_transformer_layer(transformerlayers[i])) + self.embed_dims = self.decoder_layers[0].embed_dims + assert self.embed_dims == decoder_self_posembeds['num_pos_feats'] + assert self.embed_dims == decoder_cross_posembeds['num_pos_feats'] + + # bbox_coder + self.bbox_coder = build_bbox_coder(bbox_coder) + self.num_sizes = self.bbox_coder.num_sizes + self.num_dir_bins = self.bbox_coder.num_dir_bins + + # Initial object candidate sampling + self.gsample_module = GeneralSamplingModule() + self.fps_module = Points_Sampler([self.num_proposal]) + self.points_obj_cls = PointsObjClsModule(self.in_channels) + + self.fp16_enabled = False + + # initial candidate prediction + self.conv_pred = BaseConvBboxHead( + **pred_layer_cfg, + num_cls_out_channels=self._get_cls_out_channels(), + num_reg_out_channels=self._get_reg_out_channels()) + + # query proj and key proj + self.decoder_query_proj = nn.Conv1d( + self.embed_dims, self.embed_dims, kernel_size=1) + self.decoder_key_proj = nn.Conv1d( + self.embed_dims, self.embed_dims, kernel_size=1) + + # query position embed + self.decoder_self_posembeds = nn.ModuleList() + for _ in range(self.num_decoder_layers): + self.decoder_self_posembeds.append( + build_positional_encoding(decoder_self_posembeds)) + # key position embed + self.decoder_cross_posembeds = nn.ModuleList() + for _ in range(self.num_decoder_layers): + self.decoder_cross_posembeds.append( + build_positional_encoding(decoder_cross_posembeds)) + + # Prediction Head + self.prediction_heads = nn.ModuleList() + for i in range(self.num_decoder_layers): + self.prediction_heads.append( + BaseConvBboxHead( + **pred_layer_cfg, + num_cls_out_channels=self._get_cls_out_channels(), + num_reg_out_channels=self._get_reg_out_channels())) + + self.sampling_objectness_loss = build_loss(sampling_objectness_loss) + self.objectness_loss = build_loss(objectness_loss) + self.center_loss = build_loss(center_loss) + self.dir_res_loss = build_loss(dir_res_loss) + self.dir_class_loss = build_loss(dir_class_loss) + self.semantic_loss = build_loss(semantic_loss) + if self.size_cls_agnostic: + self.size_reg_loss = 
build_loss(size_reg_loss) + else: + self.size_res_loss = build_loss(size_res_loss) + self.size_class_loss = build_loss(size_class_loss) + + def init_weights(self): + """Initialize weights of transformer decoder in GroupFree3DHead.""" + # initialize transformer + for m in self.decoder_layers.parameters(): + if m.dim() > 1: + xavier_init(m, distribution='uniform') + for m in self.decoder_self_posembeds.parameters(): + if m.dim() > 1: + xavier_init(m, distribution='uniform') + for m in self.decoder_cross_posembeds.parameters(): + if m.dim() > 1: + xavier_init(m, distribution='uniform') + + def _get_cls_out_channels(self): + """Return the channel number of classification outputs.""" + # Class numbers (k) + objectness (1) + return self.num_classes + 1 + + def _get_reg_out_channels(self): + """Return the channel number of regression outputs.""" + # center residual (3), + # heading class+residual (num_dir_bins*2), + # size class+residual(num_sizes*4 or 3) + if self.size_cls_agnostic: + return 6 + self.num_dir_bins * 2 + else: + return 3 + self.num_dir_bins * 2 + self.num_sizes * 4 + + def _extract_input(self, feat_dict): + """Extract inputs from features dictionary. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: Coordinates of input points. + torch.Tensor: Features of input points. + torch.Tensor: Indices of input points. + """ + + seed_points = feat_dict['fp_xyz'][-1] + seed_features = feat_dict['fp_features'][-1] + seed_indices = feat_dict['fp_indices'][-1] + + return seed_points, seed_features, seed_indices + + def forward(self, feat_dict, sample_mod): + """Forward pass. + + Note: + The forward of GroupFree3DHead is divided into 2 steps: + + 1. Initial object candidates sampling. + 2. Iterative object box prediction by transformer decoder. + + Args: + feat_dict (dict): Feature dict from backbone. + sample_mod (str): sample mode for initial candidates sampling. + + Returns: + results (dict): Predictions of GroupFree3D head. + """ + assert sample_mod in ['fps', 'kps'] + + seed_xyz, seed_features, seed_indices = self._extract_input(feat_dict) + + results = dict( + seed_points=seed_xyz, + seed_features=seed_features, + seed_indices=seed_indices) + + # 1. Initial object candidates sampling. + if sample_mod == 'fps': + sample_inds = self.fps_module(seed_xyz, seed_features) + elif sample_mod == 'kps': + points_obj_cls_logits = self.points_obj_cls( + seed_features) # (batch_size, 1, num_seed) + points_obj_cls_scores = points_obj_cls_logits.sigmoid().squeeze(1) + sample_inds = torch.topk(points_obj_cls_scores, + self.num_proposal)[1].int() + results['seeds_obj_cls_logits'] = points_obj_cls_logits + else: + raise NotImplementedError( + f'Sample mode {sample_mod} is not supported!') + + candidate_xyz, candidate_features, sample_inds = self.gsample_module( + seed_xyz, seed_features, sample_inds) + + results['query_points_xyz'] = candidate_xyz # (B, M, 3) + results['query_points_feature'] = candidate_features # (B, C, M) + results['query_points_sample_inds'] = sample_inds.long() # (B, M) + + prefix = 'proposal.' + cls_predictions, reg_predictions = self.conv_pred(candidate_features) + decode_res = self.bbox_coder.split_pred(cls_predictions, + reg_predictions, candidate_xyz, + prefix) + + results.update(decode_res) + bbox3d = self.bbox_coder.decode(results, prefix) + + # 2. Iterative object box prediction by transformer decoder. 
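        # Illustrative note on the decoder loop below (a sketch; the key names
        # follow the `split_pred`/`decode` prefixes used in this file):
        #   * candidate features become queries and seed features become
        #     keys/values, permuted to (num_points, batch, channel) as expected
        #     by the mmcv transformer layer.
        #   * stage i re-embeds positions from the previous stage's decoded
        #     centers/sizes (`base_bbox3d`, the first 6 box dims) and the seed
        #     coordinates, then stores its outputs under the prefix f's{i}.',
        #     e.g. 's0.center', 's0.obj_scores', ..., 's0.bbox3d'.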
+ base_bbox3d = bbox3d[:, :, :6].detach().clone() + + query = self.decoder_query_proj(candidate_features).permute(2, 0, 1) + key = self.decoder_key_proj(seed_features).permute(2, 0, 1) + value = key + + # transformer decoder + results['num_decoder_layers'] = 0 + for i in range(self.num_decoder_layers): + prefix = f's{i}.' + + query_pos = self.decoder_self_posembeds[i](base_bbox3d).permute( + 2, 0, 1) + key_pos = self.decoder_cross_posembeds[i](seed_xyz).permute( + 2, 0, 1) + + query = self.decoder_layers[i]( + query, key, value, query_pos=query_pos, + key_pos=key_pos).permute(1, 2, 0) + + results[f'{prefix}query'] = query + + cls_predictions, reg_predictions = self.prediction_heads[i](query) + decode_res = self.bbox_coder.split_pred(cls_predictions, + reg_predictions, + candidate_xyz, prefix) + # TODO: should save bbox3d instead of decode_res? + results.update(decode_res) + + bbox3d = self.bbox_coder.decode(results, prefix) + results[f'{prefix}bbox3d'] = bbox3d + base_bbox3d = bbox3d[:, :, :6].detach().clone() + query = query.permute(2, 0, 1) + + results['num_decoder_layers'] += 1 + + return results + + @force_fp32(apply_to=('bbox_preds', )) + def loss(self, + bbox_preds, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + img_metas=None, + gt_bboxes_ignore=None, + ret_target=False): + """Compute loss. + + Args: + bbox_preds (dict): Predictions from forward of vote head. + points (list[torch.Tensor]): Input points. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + pts_semantic_mask (list[torch.Tensor]): Point-wise + semantic mask. + pts_instance_mask (list[torch.Tensor]): Point-wise + instance mask. + img_metas (list[dict]): Contain pcd and img's meta info. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + ret_target (Bool): Return targets or not. + + Returns: + dict: Losses of GroupFree3D. + """ + targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, + bbox_preds) + (sampling_targets, sampling_weights, assigned_size_targets, + size_class_targets, size_res_targets, dir_class_targets, + dir_res_targets, center_targets, assigned_center_targets, + mask_targets, valid_gt_masks, objectness_targets, objectness_weights, + box_loss_weights, valid_gt_weights) = targets + + batch_size, proposal_num = size_class_targets.shape[:2] + + losses = dict() + + # calculate objectness classification loss + sampling_obj_score = bbox_preds['seeds_obj_cls_logits'].reshape(-1, 1) + sampling_objectness_loss = self.sampling_objectness_loss( + sampling_obj_score, + 1 - sampling_targets.reshape(-1), + sampling_weights.reshape(-1), + avg_factor=batch_size) + losses['sampling_objectness_loss'] = sampling_objectness_loss + + prefixes = ['proposal.'] + [ + f's{i}.' 
for i in range(bbox_preds['num_decoder_layers']) + ] + num_stages = len(prefixes) + for prefix in prefixes: + + # calculate objectness loss + obj_score = bbox_preds[f'{prefix}obj_scores'].transpose(2, 1) + objectness_loss = self.objectness_loss( + obj_score.reshape(-1, 1), + 1 - objectness_targets.reshape(-1), + objectness_weights.reshape(-1), + avg_factor=batch_size) + losses[f'{prefix}objectness_loss'] = objectness_loss / num_stages + + # calculate center loss + box_loss_weights_expand = box_loss_weights.unsqueeze(-1).expand( + -1, -1, 3) + center_loss = self.center_loss( + bbox_preds[f'{prefix}center'], + assigned_center_targets, + weight=box_loss_weights_expand) + losses[f'{prefix}center_loss'] = center_loss / num_stages + + # calculate direction class loss + dir_class_loss = self.dir_class_loss( + bbox_preds[f'{prefix}dir_class'].transpose(2, 1), + dir_class_targets, + weight=box_loss_weights) + losses[f'{prefix}dir_class_loss'] = dir_class_loss / num_stages + + # calculate direction residual loss + heading_label_one_hot = size_class_targets.new_zeros( + (batch_size, proposal_num, self.num_dir_bins)) + heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), + 1) + dir_res_norm = torch.sum( + bbox_preds[f'{prefix}dir_res_norm'] * heading_label_one_hot, + -1) + dir_res_loss = self.dir_res_loss( + dir_res_norm, dir_res_targets, weight=box_loss_weights) + losses[f'{prefix}dir_res_loss'] = dir_res_loss / num_stages + + if self.size_cls_agnostic: + # calculate class-agnostic size loss + size_reg_loss = self.size_reg_loss( + bbox_preds[f'{prefix}size'], + assigned_size_targets, + weight=box_loss_weights_expand) + losses[f'{prefix}size_reg_loss'] = size_reg_loss / num_stages + + else: + # calculate size class loss + size_class_loss = self.size_class_loss( + bbox_preds[f'{prefix}size_class'].transpose(2, 1), + size_class_targets, + weight=box_loss_weights) + losses[ + f'{prefix}size_class_loss'] = size_class_loss / num_stages + + # calculate size residual loss + one_hot_size_targets = size_class_targets.new_zeros( + (batch_size, proposal_num, self.num_sizes)) + one_hot_size_targets.scatter_(2, + size_class_targets.unsqueeze(-1), + 1) + one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( + -1).expand(-1, -1, -1, 3).contiguous() + size_residual_norm = torch.sum( + bbox_preds[f'{prefix}size_res_norm'] * + one_hot_size_targets_expand, 2) + box_loss_weights_expand = box_loss_weights.unsqueeze( + -1).expand(-1, -1, 3) + size_res_loss = self.size_res_loss( + size_residual_norm, + size_res_targets, + weight=box_loss_weights_expand) + losses[f'{prefix}size_res_loss'] = size_res_loss / num_stages + + # calculate semantic loss + semantic_loss = self.semantic_loss( + bbox_preds[f'{prefix}sem_scores'].transpose(2, 1), + mask_targets, + weight=box_loss_weights) + losses[f'{prefix}semantic_loss'] = semantic_loss / num_stages + + if ret_target: + losses['targets'] = targets + + return losses + + def get_targets(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + bbox_preds=None, + max_gt_num=64): + """Generate targets of GroupFree3D head. + + Args: + points (list[torch.Tensor]): Points of each batch. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): Labels of each batch. + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): Point-wise instance + label of each batch. 
+ bbox_preds (torch.Tensor): Bounding box predictions of vote head. + max_gt_num (int): Max number of GTs for single batch. + + Returns: + tuple[torch.Tensor]: Targets of GroupFree3D head. + """ + # find empty example + valid_gt_masks = list() + gt_num = list() + for index in range(len(gt_labels_3d)): + if len(gt_labels_3d[index]) == 0: + fake_box = gt_bboxes_3d[index].tensor.new_zeros( + 1, gt_bboxes_3d[index].tensor.shape[-1]) + gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) + gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) + valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) + gt_num.append(1) + else: + valid_gt_masks.append(gt_labels_3d[index].new_ones( + gt_labels_3d[index].shape)) + gt_num.append(gt_labels_3d[index].shape[0]) + # max_gt_num = max(gt_num) + + max_gt_nums = [max_gt_num for _ in range(len(gt_labels_3d))] + + if pts_semantic_mask is None: + pts_semantic_mask = [None for i in range(len(gt_labels_3d))] + pts_instance_mask = [None for i in range(len(gt_labels_3d))] + + seed_points = [ + bbox_preds['seed_points'][i] for i in range(len(gt_labels_3d)) + ] + + seed_indices = [ + bbox_preds['seed_indices'][i] for i in range(len(gt_labels_3d)) + ] + + candidate_indices = [ + bbox_preds['query_points_sample_inds'][i] + for i in range(len(gt_labels_3d)) + ] + + (sampling_targets, assigned_size_targets, size_class_targets, + size_res_targets, dir_class_targets, dir_res_targets, center_targets, + assigned_center_targets, mask_targets, objectness_targets, + objectness_masks) = multi_apply(self.get_targets_single, points, + gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, + max_gt_nums, seed_points, + seed_indices, candidate_indices) + + # pad targets as original code of GroupFree3D. + for index in range(len(gt_labels_3d)): + pad_num = max_gt_num - gt_labels_3d[index].shape[0] + valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num)) + + sampling_targets = torch.stack(sampling_targets) + sampling_weights = (sampling_targets >= 0).float() + sampling_normalizer = sampling_weights.sum(dim=1, keepdim=True).float() + sampling_weights /= sampling_normalizer.clamp(min=1.0) + + assigned_size_targets = torch.stack(assigned_size_targets) + center_targets = torch.stack(center_targets) + valid_gt_masks = torch.stack(valid_gt_masks) + + assigned_center_targets = torch.stack(assigned_center_targets) + objectness_targets = torch.stack(objectness_targets) + + objectness_weights = torch.stack(objectness_masks) + cls_normalizer = objectness_weights.sum(dim=1, keepdim=True).float() + objectness_weights /= cls_normalizer.clamp(min=1.0) + + box_loss_weights = objectness_targets.float() / ( + objectness_targets.sum().float() + EPS) + + valid_gt_weights = valid_gt_masks.float() / ( + valid_gt_masks.sum().float() + EPS) + + dir_class_targets = torch.stack(dir_class_targets) + dir_res_targets = torch.stack(dir_res_targets) + size_class_targets = torch.stack(size_class_targets) + size_res_targets = torch.stack(size_res_targets) + mask_targets = torch.stack(mask_targets) + + return (sampling_targets, sampling_weights, assigned_size_targets, + size_class_targets, size_res_targets, dir_class_targets, + dir_res_targets, center_targets, assigned_center_targets, + mask_targets, valid_gt_masks, objectness_targets, + objectness_weights, box_loss_weights, valid_gt_weights) + + def get_targets_single(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + max_gt_nums=None, + seed_points=None, + seed_indices=None, + 
candidate_indices=None, + seed_points_obj_topk=4): + """Generate targets of GroupFree3D head for single batch. + + Args: + points (torch.Tensor): Points of each batch. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth + boxes of each batch. + gt_labels_3d (torch.Tensor): Labels of each batch. + pts_semantic_mask (torch.Tensor): Point-wise semantic + label of each batch. + pts_instance_mask (torch.Tensor): Point-wise instance + label of each batch. + max_gt_nums (int): Max number of GTs for single batch. + seed_points (torch.Tensor): Coordinates of seed points. + seed_indices (torch.Tensor): Indices of seed points. + candidate_indices (torch.Tensor): Indices of object candidates. + seed_points_obj_topk (int): k value of k-Closest Points Sampling. + + Returns: + tuple[torch.Tensor]: Targets of GroupFree3D head. + """ + + assert self.bbox_coder.with_rot or pts_semantic_mask is not None + + gt_bboxes_3d = gt_bboxes_3d.to(points.device) + + # generate center, dir, size target + (center_targets, size_targets, size_class_targets, size_res_targets, + dir_class_targets, + dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) + + # pad targets as original code of GroupFree3D + pad_num = max_gt_nums - gt_labels_3d.shape[0] + box_label_mask = points.new_zeros([max_gt_nums]) + box_label_mask[:gt_labels_3d.shape[0]] = 1 + + gt_bboxes_pad = F.pad(gt_bboxes_3d.tensor, (0, 0, 0, pad_num)) + gt_bboxes_pad[gt_labels_3d.shape[0]:, 0:3] += 1000 + gt_bboxes_3d = gt_bboxes_3d.new_box(gt_bboxes_pad) + + gt_labels_3d = F.pad(gt_labels_3d, (0, pad_num)) + + center_targets = F.pad(center_targets, (0, 0, 0, pad_num), value=1000) + size_targets = F.pad(size_targets, (0, 0, 0, pad_num)) + size_class_targets = F.pad(size_class_targets, (0, pad_num)) + size_res_targets = F.pad(size_res_targets, (0, 0, 0, pad_num)) + dir_class_targets = F.pad(dir_class_targets, (0, pad_num)) + dir_res_targets = F.pad(dir_res_targets, (0, pad_num)) + + # 0. 
generate pts_instance_label and pts_obj_mask + num_points = points.shape[0] + pts_obj_mask = points.new_zeros([num_points], dtype=torch.long) + pts_instance_label = points.new_zeros([num_points], + dtype=torch.long) - 1 + + if self.bbox_coder.with_rot: + vote_targets = points.new_zeros([num_points, 4 * self.gt_per_seed]) + vote_target_idx = points.new_zeros([num_points], dtype=torch.long) + box_indices_all = gt_bboxes_3d.points_in_boxes_part(points) + for i in range(gt_labels_3d.shape[0]): + box_indices = box_indices_all[:, i] + indices = torch.nonzero( + box_indices, as_tuple=False).squeeze(-1) + selected_points = points[indices] + pts_obj_mask[indices] = 1 + vote_targets_tmp = vote_targets[indices] + votes = gt_bboxes_3d.gravity_center[i].unsqueeze( + 0) - selected_points[:, :3] + + for j in range(self.gt_per_seed): + column_indices = torch.nonzero( + vote_target_idx[indices] == j, + as_tuple=False).squeeze(-1) + vote_targets_tmp[column_indices, + int(j * 3):int(j * 3 + + 3)] = votes[column_indices] + vote_targets_tmp[column_indices, + j + 3 * self.gt_per_seed] = i + if j == 0: + vote_targets_tmp[ + column_indices, :3 * + self.gt_per_seed] = votes[column_indices].repeat( + 1, self.gt_per_seed) + vote_targets_tmp[column_indices, + 3 * self.gt_per_seed:] = i + + vote_targets[indices] = vote_targets_tmp + vote_target_idx[indices] = torch.clamp( + vote_target_idx[indices] + 1, max=2) + + dist = points.new_zeros([num_points, self.gt_per_seed]) + 1000 + for j in range(self.gt_per_seed): + dist[:, j] = (vote_targets[:, 3 * j:3 * j + 3]**2).sum(-1) + + instance_indices = torch.argmin( + dist, dim=-1).unsqueeze(-1) + 3 * self.gt_per_seed + instance_lable = torch.gather(vote_targets, 1, + instance_indices).squeeze(-1) + pts_instance_label = instance_lable.long() + pts_instance_label[pts_obj_mask == 0] = -1 + + elif pts_semantic_mask is not None: + for i in torch.unique(pts_instance_mask): + indices = torch.nonzero( + pts_instance_mask == i, as_tuple=False).squeeze(-1) + + if pts_semantic_mask[indices[0]] < self.num_classes: + selected_points = points[indices, :3] + center = 0.5 * ( + selected_points.min(0)[0] + selected_points.max(0)[0]) + + delta_xyz = center - center_targets + instance_lable = torch.argmin((delta_xyz**2).sum(-1)) + pts_instance_label[indices] = instance_lable + pts_obj_mask[indices] = 1 + + else: + raise NotImplementedError + + # 1. 
generate objectness targets in sampling head + gt_num = gt_labels_3d.shape[0] + num_seed = seed_points.shape[0] + num_candidate = candidate_indices.shape[0] + + object_assignment = torch.gather(pts_instance_label, 0, seed_indices) + # set background points to the last gt bbox as original code + object_assignment[object_assignment < 0] = gt_num - 1 + object_assignment_one_hot = gt_bboxes_3d.tensor.new_zeros( + (num_seed, gt_num)) + object_assignment_one_hot.scatter_(1, object_assignment.unsqueeze(-1), + 1) # (num_seed, gt_num) + + delta_xyz = seed_points.unsqueeze( + 1) - gt_bboxes_3d.gravity_center.unsqueeze( + 0) # (num_seed, gt_num, 3) + delta_xyz = delta_xyz / (gt_bboxes_3d.dims.unsqueeze(0) + EPS) + + new_dist = torch.sum(delta_xyz**2, dim=-1) + euclidean_dist1 = torch.sqrt(new_dist + EPS) + euclidean_dist1 = euclidean_dist1 * object_assignment_one_hot + 100 * ( + 1 - object_assignment_one_hot) + # (gt_num, num_seed) + euclidean_dist1 = euclidean_dist1.permute(1, 0) + + # gt_num x topk + topk_inds = torch.topk( + euclidean_dist1, + seed_points_obj_topk, + largest=False)[1] * box_label_mask[:, None] + \ + (box_label_mask[:, None] - 1) + topk_inds = topk_inds.long() + topk_inds = topk_inds.view(-1).contiguous() + + sampling_targets = torch.zeros( + num_seed + 1, dtype=torch.long).to(points.device) + sampling_targets[topk_inds] = 1 + sampling_targets = sampling_targets[:num_seed] + # pts_instance_label + objectness_label_mask = torch.gather(pts_instance_label, 0, + seed_indices) # num_seed + sampling_targets[objectness_label_mask < 0] = 0 + + # 2. objectness target + seed_obj_gt = torch.gather(pts_obj_mask, 0, seed_indices) # num_seed + objectness_targets = torch.gather(seed_obj_gt, 0, + candidate_indices) # num_candidate + + # 3. box target + seed_instance_label = torch.gather(pts_instance_label, 0, + seed_indices) # num_seed + query_points_instance_label = torch.gather( + seed_instance_label, 0, candidate_indices) # num_candidate + + # Set assignment + # (num_candidate, ) with values in 0,1,...,gt_num-1 + assignment = query_points_instance_label + # set background points to the last gt bbox as original code + assignment[assignment < 0] = gt_num - 1 + assignment_expand = assignment.unsqueeze(1).expand(-1, 3) + + assigned_center_targets = center_targets[assignment] + assigned_size_targets = size_targets[assignment] + + dir_class_targets = dir_class_targets[assignment] + dir_res_targets = dir_res_targets[assignment] + dir_res_targets /= (np.pi / self.num_dir_bins) + + size_class_targets = size_class_targets[assignment] + size_res_targets = \ + torch.gather(size_res_targets, 0, assignment_expand) + one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros( + (num_candidate, self.num_sizes)) + one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1) + one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).expand( + -1, -1, 3) # (num_candidate,num_size_cluster,3) + mean_sizes = size_res_targets.new_tensor( + self.bbox_coder.mean_sizes).unsqueeze(0) + pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1) + size_res_targets /= pos_mean_sizes + + mask_targets = gt_labels_3d[assignment].long() + + objectness_masks = points.new_ones((num_candidate)) + + return (sampling_targets, assigned_size_targets, size_class_targets, + size_res_targets, dir_class_targets, dir_res_targets, + center_targets, assigned_center_targets, mask_targets, + objectness_targets, objectness_masks) + + def get_bboxes(self, + points, + bbox_preds, + input_metas, + rescale=False, + use_nms=True): + 
"""Generate bboxes from GroupFree3D head predictions. + + Args: + points (torch.Tensor): Input points. + bbox_preds (dict): Predictions from GroupFree3D head. + input_metas (list[dict]): Point cloud and image's meta info. + rescale (bool): Whether to rescale bboxes. + use_nms (bool): Whether to apply NMS, skip nms postprocessing + while using GroupFree3D head in rpn stage. + + Returns: + list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. + """ + # support multi-stage predictions + assert self.test_cfg['prediction_stages'] in \ + ['last', 'all', 'last_three'] + + prefixes = list() + if self.test_cfg['prediction_stages'] == 'last': + prefixes = [f's{self.num_decoder_layers - 1}.'] + elif self.test_cfg['prediction_stages'] == 'all': + prefixes = ['proposal.'] + \ + [f's{i}.' for i in range(self.num_decoder_layers)] + elif self.test_cfg['prediction_stages'] == 'last_three': + prefixes = [ + f's{i}.' for i in range(self.num_decoder_layers - + 3, self.num_decoder_layers) + ] + else: + raise NotImplementedError + + obj_scores = list() + sem_scores = list() + bbox3d = list() + for prefix in prefixes: + # decode boxes + obj_score = bbox_preds[f'{prefix}obj_scores'][..., -1].sigmoid() + sem_score = bbox_preds[f'{prefix}sem_scores'].softmax(-1) + bbox = self.bbox_coder.decode(bbox_preds, prefix) + obj_scores.append(obj_score) + sem_scores.append(sem_score) + bbox3d.append(bbox) + + obj_scores = torch.cat(obj_scores, dim=1) + sem_scores = torch.cat(sem_scores, dim=1) + bbox3d = torch.cat(bbox3d, dim=1) + + if use_nms: + batch_size = bbox3d.shape[0] + results = list() + for b in range(batch_size): + bbox_selected, score_selected, labels = \ + self.multiclass_nms_single(obj_scores[b], sem_scores[b], + bbox3d[b], points[b, ..., :3], + input_metas[b]) + bbox = input_metas[b]['box_type_3d']( + bbox_selected, + box_dim=bbox_selected.shape[-1], + with_yaw=self.bbox_coder.with_rot) + results.append((bbox, score_selected, labels)) + + return results + else: + return bbox3d + + def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, + input_meta): + """Multi-class nms in single batch. + + Args: + obj_scores (torch.Tensor): Objectness score of bounding boxes. + sem_scores (torch.Tensor): semantic class score of bounding boxes. + bbox (torch.Tensor): Predicted bounding boxes. + points (torch.Tensor): Input points. + input_meta (dict): Point cloud and image's meta info. + + Returns: + tuple[torch.Tensor]: Bounding boxes, scores and labels. 
+ """ + bbox = input_meta['box_type_3d']( + bbox, + box_dim=bbox.shape[-1], + with_yaw=self.bbox_coder.with_rot, + origin=(0.5, 0.5, 0.5)) + box_indices = bbox.points_in_boxes_all(points) + + corner3d = bbox.corners + minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) + minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] + minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] + + nonempty_box_mask = box_indices.T.sum(1) > 5 + + bbox_classes = torch.argmax(sem_scores, -1) + nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], + obj_scores[nonempty_box_mask], + bbox_classes[nonempty_box_mask], + self.test_cfg.nms_thr) + + # filter empty boxes and boxes with low score + scores_mask = (obj_scores > self.test_cfg.score_thr) + nonempty_box_inds = torch.nonzero( + nonempty_box_mask, as_tuple=False).flatten() + nonempty_mask = torch.zeros_like(bbox_classes).scatter( + 0, nonempty_box_inds[nms_selected], 1) + selected = (nonempty_mask.bool() & scores_mask.bool()) + + if self.test_cfg.per_class_proposal: + bbox_selected, score_selected, labels = [], [], [] + for k in range(sem_scores.shape[-1]): + bbox_selected.append(bbox[selected].tensor) + score_selected.append(obj_scores[selected] * + sem_scores[selected][:, k]) + labels.append( + torch.zeros_like(bbox_classes[selected]).fill_(k)) + bbox_selected = torch.cat(bbox_selected, 0) + score_selected = torch.cat(score_selected, 0) + labels = torch.cat(labels, 0) + else: + bbox_selected = bbox[selected].tensor + score_selected = obj_scores[selected] + labels = bbox_classes[selected] + + return bbox_selected, score_selected, labels diff --git a/mmdet3d/models/dense_heads/monoflex_head.py b/mmdet3d/models/dense_heads/monoflex_head.py index 2253c75..ca1f835 100644 --- a/mmdet3d/models/dense_heads/monoflex_head.py +++ b/mmdet3d/models/dense_heads/monoflex_head.py @@ -1,771 +1,771 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import xavier_init -from torch import nn as nn - -from mmdet3d.core.utils import get_ellip_gaussian_2D -from mmdet3d.models.model_utils import EdgeFusionModule -from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices, - get_keypoints, handle_proj_objs) -from mmdet.core import multi_apply -from mmdet.core.bbox.builder import build_bbox_coder -from mmdet.models.utils import gaussian_radius, gen_gaussian_target -from mmdet.models.utils.gaussian_target import (get_local_maximum, - get_topk_from_heatmap, - transpose_and_gather_feat) -from ..builder import HEADS, build_loss -from .anchor_free_mono3d_head import AnchorFreeMono3DHead - - -@HEADS.register_module() -class MonoFlexHead(AnchorFreeMono3DHead): - r"""MonoFlex head used in `MonoFlex `_ - - .. code-block:: none - - / --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls - | - | --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox - | - | --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets - | - | --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets - | - | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty - feature - | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty - | - | --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions - | - | |--- 1 x 1 conv --> ori cls - | --> 3 x 3 conv --| - | |--- 1 x 1 conv --> ori offsets - | - | --> 3 x 3 conv --> 1 x 1 conv --> depth - | - \ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty - - Args: - use_edge_fusion (bool): Whether to use edge fusion module while - feature extraction. - edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion. 
- edge_heatmap_ratio (float): Ratio of generating target heatmap. - filter_outside_objs (bool, optional): Whether to filter the - outside objects. Default: True. - loss_cls (dict, optional): Config of classification loss. - Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0). - loss_bbox (dict, optional): Config of localization loss. - Default: loss_bbox=dict(type='IOULoss', loss_weight=10.0). - loss_dir (dict, optional): Config of direction classification loss. - Default: dict(type='MultibinLoss', loss_weight=0.1). - loss_keypoints (dict, optional): Config of keypoints loss. - Default: dict(type='L1Loss', loss_weight=0.1). - loss_dims: (dict, optional): Config of dimensions loss. - Default: dict(type='L1Loss', loss_weight=0.1). - loss_offsets2d: (dict, optional): Config of offsets2d loss. - Default: dict(type='L1Loss', loss_weight=0.1). - loss_direct_depth: (dict, optional): Config of directly regression depth loss. - Default: dict(type='L1Loss', loss_weight=0.1). - loss_keypoints_depth: (dict, optional): Config of keypoints decoded depth loss. - Default: dict(type='L1Loss', loss_weight=0.1). - loss_combined_depth: (dict, optional): Config of combined depth loss. - Default: dict(type='L1Loss', loss_weight=0.1). - loss_attr (dict, optional): Config of attribute classification loss. - In MonoFlex, Default: None. - bbox_coder (dict, optional): Bbox coder for encoding and decoding boxes. - Default: dict(type='MonoFlexCoder', code_size=7). - norm_cfg (dict, optional): Dictionary to construct and config norm layer. - Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). - init_cfg (dict): Initialization config dict. Default: None. - """ # noqa: E501 - - def __init__(self, - num_classes, - in_channels, - use_edge_fusion, - edge_fusion_inds, - edge_heatmap_ratio, - filter_outside_objs=True, - loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0), - loss_bbox=dict(type='IoULoss', loss_weight=0.1), - loss_dir=dict(type='MultiBinLoss', loss_weight=0.1), - loss_keypoints=dict(type='L1Loss', loss_weight=0.1), - loss_dims=dict(type='L1Loss', loss_weight=0.1), - loss_offsets2d=dict(type='L1Loss', loss_weight=0.1), - loss_direct_depth=dict(type='L1Loss', loss_weight=0.1), - loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1), - loss_combined_depth=dict(type='L1Loss', loss_weight=0.1), - loss_attr=None, - bbox_coder=dict(type='MonoFlexCoder', code_size=7), - norm_cfg=dict(type='BN'), - init_cfg=None, - init_bias=-2.19, - **kwargs): - self.use_edge_fusion = use_edge_fusion - self.edge_fusion_inds = edge_fusion_inds - super().__init__( - num_classes, - in_channels, - loss_cls=loss_cls, - loss_bbox=loss_bbox, - loss_dir=loss_dir, - loss_attr=loss_attr, - norm_cfg=norm_cfg, - init_cfg=init_cfg, - **kwargs) - self.filter_outside_objs = filter_outside_objs - self.edge_heatmap_ratio = edge_heatmap_ratio - self.init_bias = init_bias - self.loss_dir = build_loss(loss_dir) - self.loss_keypoints = build_loss(loss_keypoints) - self.loss_dims = build_loss(loss_dims) - self.loss_offsets2d = build_loss(loss_offsets2d) - self.loss_direct_depth = build_loss(loss_direct_depth) - self.loss_keypoints_depth = build_loss(loss_keypoints_depth) - self.loss_combined_depth = build_loss(loss_combined_depth) - self.bbox_coder = build_bbox_coder(bbox_coder) - - def _init_edge_module(self): - """Initialize edge fusion module for feature extraction.""" - self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256) - for i in range(len(self.edge_fusion_inds)): - reg_inds, out_inds = 
self.edge_fusion_inds[i] - out_channels = self.group_reg_dims[reg_inds][out_inds] - fusion_layer = EdgeFusionModule(out_channels, 256) - layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}' - self.add_module(layer_name, fusion_layer) - - def init_weights(self): - """Initialize weights.""" - super().init_weights() - self.conv_cls.bias.data.fill_(self.init_bias) - xavier_init(self.conv_regs[4][0], gain=0.01) - xavier_init(self.conv_regs[7][0], gain=0.01) - for m in self.conv_regs.modules(): - if isinstance(m, nn.Conv2d): - if m.bias is not None: - nn.init.constant_(m.bias, 0) - - def _init_predictor(self): - """Initialize predictor layers of the head.""" - self.conv_cls_prev = self._init_branch( - conv_channels=self.cls_branch, - conv_strides=(1, ) * len(self.cls_branch)) - self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels, - 1) - # init regression head - self.conv_reg_prevs = nn.ModuleList() - # init output head - self.conv_regs = nn.ModuleList() - # group_reg_dims: - # ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, )) - for i in range(len(self.group_reg_dims)): - reg_dims = self.group_reg_dims[i] - reg_branch_channels = self.reg_branch[i] - out_channel = self.out_channels[i] - reg_list = nn.ModuleList() - if len(reg_branch_channels) > 0: - self.conv_reg_prevs.append( - self._init_branch( - conv_channels=reg_branch_channels, - conv_strides=(1, ) * len(reg_branch_channels))) - for reg_dim in reg_dims: - reg_list.append(nn.Conv2d(out_channel, reg_dim, 1)) - self.conv_regs.append(reg_list) - else: - self.conv_reg_prevs.append(None) - for reg_dim in reg_dims: - reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1)) - self.conv_regs.append(reg_list) - - def _init_layers(self): - """Initialize layers of the head.""" - self._init_predictor() - if self.use_edge_fusion: - self._init_edge_module() - - def forward_train(self, x, input_metas, gt_bboxes, gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, depths, attr_labels, - gt_bboxes_ignore, proposal_cfg, **kwargs): - """ - Args: - x (list[Tensor]): Features from FPN. - input_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes (list[Tensor]): Ground truth bboxes of the image, - shape (num_gts, 4). - gt_labels (list[Tensor]): Ground truth labels of each box, - shape (num_gts,). - gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image, - shape (num_gts, self.bbox_code_size). - gt_labels_3d (list[Tensor]): 3D ground truth labels of each box, - shape (num_gts,). - centers2d (list[Tensor]): Projected 3D center of each box, - shape (num_gts, 2). - depths (list[Tensor]): Depth of projected 3D center of each box, - shape (num_gts,). - attr_labels (list[Tensor]): Attribute labels of each box, - shape (num_gts,). - gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be - ignored, shape (num_ignored_gts, 4). - proposal_cfg (mmcv.Config): Test / postprocessing configuration, - if None, test_cfg would be used - Returns: - tuple: - losses: (dict[str, Tensor]): A dictionary of loss components. - proposal_list (list[Tensor]): Proposals of each image. 
- """ - outs = self(x, input_metas) - if gt_labels is None: - loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths, - attr_labels, input_metas) - else: - loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, depths, attr_labels, - input_metas) - losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - if proposal_cfg is None: - return losses - else: - proposal_list = self.get_bboxes( - *outs, input_metas, cfg=proposal_cfg) - return losses, proposal_list - - def forward(self, feats, input_metas): - """Forward features from the upstream network. - - Args: - feats (list[Tensor]): Features from the upstream network, each is - a 4D-tensor. - input_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Returns: - tuple: - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_classes. - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - """ - mlvl_input_metas = [input_metas for i in range(len(feats))] - return multi_apply(self.forward_single, feats, mlvl_input_metas) - - def forward_single(self, x, input_metas): - """Forward features of a single scale level. - - Args: - x (Tensor): Feature maps from a specific FPN feature level. - input_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Returns: - tuple: Scores for each class, bbox predictions. - """ - img_h, img_w = input_metas[0]['pad_shape'][:2] - batch_size, _, feat_h, feat_w = x.shape - downsample_ratio = img_h / feat_h - - for conv_cls_prev_layer in self.conv_cls_prev: - cls_feat = conv_cls_prev_layer(x) - out_cls = self.conv_cls(cls_feat) - - if self.use_edge_fusion: - # calculate the edge indices for the batch data - edge_indices_list = get_edge_indices( - input_metas, downsample_ratio, device=x.device) - edge_lens = [ - edge_indices.shape[0] for edge_indices in edge_indices_list - ] - max_edge_len = max(edge_lens) - edge_indices = x.new_zeros((batch_size, max_edge_len, 2), - dtype=torch.long) - for i in range(batch_size): - edge_indices[i, :edge_lens[i]] = edge_indices_list[i] - # cls feature map edge fusion - out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices, - edge_lens, feat_h, feat_w) - - bbox_pred = [] - - for i in range(len(self.group_reg_dims)): - reg_feat = x.clone() - # feature regression head - if len(self.reg_branch[i]) > 0: - for conv_reg_prev_layer in self.conv_reg_prevs[i]: - reg_feat = conv_reg_prev_layer(reg_feat) - - for j, conv_reg in enumerate(self.conv_regs[i]): - out_reg = conv_reg(reg_feat) - # Use Edge Fusion Module - if self.use_edge_fusion and (i, j) in self.edge_fusion_inds: - # reg feature map edge fusion - out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format( - i, j))(reg_feat, out_reg, edge_indices, edge_lens, - feat_h, feat_w) - bbox_pred.append(out_reg) - - bbox_pred = torch.cat(bbox_pred, dim=1) - cls_score = out_cls.sigmoid() # turn to 0-1 - cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4) - - return cls_score, bbox_pred - - def get_bboxes(self, cls_scores, bbox_preds, input_metas): - """Generate bboxes from bbox head predictions. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level. - bbox_preds (list[Tensor]): Box regression for each scale. - input_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. 
- rescale (bool): If True, return boxes in original image space. - Returns: - list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]: - Each item in result_list is 4-tuple. - """ - assert len(cls_scores) == len(bbox_preds) == 1 - cam2imgs = torch.stack([ - cls_scores[0].new_tensor(input_meta['cam2img']) - for input_meta in input_metas - ]) - batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap( - cls_scores[0], - bbox_preds[0], - input_metas, - cam2imgs=cam2imgs, - topk=100, - kernel=3) - - result_list = [] - for img_id in range(len(input_metas)): - - bboxes = batch_bboxes[img_id] - scores = batch_scores[img_id] - labels = batch_topk_labels[img_id] - - keep_idx = scores > 0.25 - bboxes = bboxes[keep_idx] - scores = scores[keep_idx] - labels = labels[keep_idx] - - bboxes = input_metas[img_id]['box_type_3d']( - bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) - attrs = None - result_list.append((bboxes, scores, labels, attrs)) - - return result_list - - def decode_heatmap(self, - cls_score, - reg_pred, - input_metas, - cam2imgs, - topk=100, - kernel=3): - """Transform outputs into detections raw bbox predictions. - - Args: - class_score (Tensor): Center predict heatmap, - shape (B, num_classes, H, W). - reg_pred (Tensor): Box regression map. - shape (B, channel, H , W). - input_metas (List[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - cam2imgs (Tensor): Camera intrinsic matrix. - shape (N, 4, 4) - topk (int, optional): Get top k center keypoints from heatmap. - Default 100. - kernel (int, optional): Max pooling kernel for extract local - maximum pixels. Default 3. - - Returns: - tuple[torch.Tensor]: Decoded output of SMOKEHead, containing - the following Tensors: - - batch_bboxes (Tensor): Coords of each 3D box. - shape (B, k, 7) - - batch_scores (Tensor): Scores of each 3D box. - shape (B, k) - - batch_topk_labels (Tensor): Categories of each 3D box. - shape (B, k) - """ - img_h, img_w = input_metas[0]['pad_shape'][:2] - batch_size, _, feat_h, feat_w = cls_score.shape - - downsample_ratio = img_h / feat_h - center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel) - - *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap( - center_heatmap_pred, k=topk) - batch_scores, batch_index, batch_topk_labels = batch_dets - - regression = transpose_and_gather_feat(reg_pred, batch_index) - regression = regression.view(-1, 8) - - pred_base_centers2d = torch.cat( - [topk_xs.view(-1, 1), - topk_ys.view(-1, 1).float()], dim=1) - preds = self.bbox_coder.decode(regression, batch_topk_labels, - downsample_ratio, cam2imgs) - pred_locations = self.bbox_coder.decode_location( - pred_base_centers2d, preds['offsets2d'], preds['combined_depth'], - cam2imgs, downsample_ratio) - pred_yaws = self.bbox_coder.decode_orientation( - preds['orientations']).unsqueeze(-1) - pred_dims = preds['dimensions'] - batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws), dim=1) - batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size) - return batch_bboxes, batch_scores, batch_topk_labels - - def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask, - batch_indices, input_metas, downsample_ratio): - """Prepare predictions for computing loss. - - Args: - pred_reg (Tensor): Box regression map. - shape (B, channel, H , W). - labels3d (Tensor): Labels of each 3D box. - shape (B * max_objs, ) - centers2d (Tensor): Coords of each projected 3D box - center on image. 
shape (N, 2) - reg_mask (Tensor): Indexes of the existence of the 3D box. - shape (B * max_objs, ) - batch_indices (Tenosr): Batch indices of the 3D box. - shape (N, 3) - input_metas (list[dict]): Meta information of each image, - e.g., image size, scaling factor, etc. - downsample_ratio (int): The stride of feature map. - - Returns: - dict: The predictions for computing loss. - """ - batch, channel = pred_reg.shape[0], pred_reg.shape[1] - w = pred_reg.shape[3] - cam2imgs = torch.stack([ - centers2d.new_tensor(input_meta['cam2img']) - for input_meta in input_metas - ]) - # (batch_size, 4, 4) -> (N, 4, 4) - cam2imgs = cam2imgs[batch_indices, :, :] - centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0] - centers2d_inds = centers2d_inds.view(batch, -1) - pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds) - pred_regression_pois = pred_regression.view(-1, channel)[reg_mask] - preds = self.bbox_coder.decode(pred_regression_pois, labels3d, - downsample_ratio, cam2imgs) - - return preds - - def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, - gt_labels_3d_list, centers2d_list, depths_list, feat_shape, - img_shape, input_metas): - """Get training targets for batch images. -`` - Args: - gt_bboxes_list (list[Tensor]): Ground truth bboxes of each - image, shape (num_gt, 4). - gt_labels_list (list[Tensor]): Ground truth labels of each - box, shape (num_gt,). - gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D - Ground truth bboxes of each image, - shape (num_gt, bbox_code_size). - gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of - each box, shape (num_gt,). - centers2d_list (list[Tensor]): Projected 3D centers onto 2D - image, shape (num_gt, 2). - depths_list (list[Tensor]): Depth of projected 3D centers onto 2D - image, each has shape (num_gt, 1). - feat_shape (tuple[int]): Feature map shape with value, - shape (B, _, H, W). - img_shape (tuple[int]): Image shape in [h, w] format. - input_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Returns: - tuple[Tensor, dict]: The Tensor value is the targets of - center heatmap, the dict has components below: - - base_centers2d_target (Tensor): Coords of each projected 3D box - center on image. shape (B * max_objs, 2), [dtype: int] - - labels3d (Tensor): Labels of each 3D box. - shape (N, ) - - reg_mask (Tensor): Mask of the existence of the 3D box. - shape (B * max_objs, ) - - batch_indices (Tensor): Batch id of the 3D box. - shape (N, ) - - depth_target (Tensor): Depth target of each 3D box. - shape (N, ) - - keypoints2d_target (Tensor): Keypoints of each projected 3D box - on image. shape (N, 10, 2) - - keypoints_mask (Tensor): Keypoints mask of each projected 3D - box on image. shape (N, 10) - - keypoints_depth_mask (Tensor): Depths decoded from keypoints - of each 3D box. shape (N, 3) - - orientations_target (Tensor): Orientation (encoded local yaw) - target of each 3D box. shape (N, ) - - offsets2d_target (Tensor): Offsets target of each projected - 3D box. shape (N, 2) - - dimensions_target (Tensor): Dimensions target of each 3D box. - shape (N, 3) - - downsample_ratio (int): The stride of feature map. - """ - - img_h, img_w = img_shape[:2] - batch_size, _, feat_h, feat_w = feat_shape - - width_ratio = float(feat_w / img_w) # 1/4 - height_ratio = float(feat_h / img_h) # 1/4 - - assert width_ratio == height_ratio - - # Whether to filter the objects which are not in FOV. 
- if self.filter_outside_objs: - filter_outside_objs(gt_bboxes_list, gt_labels_list, - gt_bboxes_3d_list, gt_labels_3d_list, - centers2d_list, input_metas) - - # transform centers2d to base centers2d for regression and - # heatmap generation. - # centers2d = int(base_centers2d) + offsets2d - base_centers2d_list, offsets2d_list, trunc_mask_list = \ - handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas) - - keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \ - get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas) - - center_heatmap_target = gt_bboxes_list[-1].new_zeros( - [batch_size, self.num_classes, feat_h, feat_w]) - - for batch_id in range(batch_size): - # project gt_bboxes from input image to feat map - gt_bboxes = gt_bboxes_list[batch_id] * width_ratio - gt_labels = gt_labels_list[batch_id] - - # project base centers2d from input image to feat map - gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio - trunc_masks = trunc_mask_list[batch_id] - - for j, base_center2d in enumerate(gt_base_centers2d): - if trunc_masks[j]: - # for outside objects, generate ellipse heatmap - base_center2d_x_int, base_center2d_y_int = \ - base_center2d.int() - scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0], - gt_bboxes[j][2] - base_center2d_x_int) - scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1], - gt_bboxes[j][3] - base_center2d_y_int) - radius_x = scale_box_w * self.edge_heatmap_ratio - radius_y = scale_box_h * self.edge_heatmap_ratio - radius_x, radius_y = max(0, int(radius_x)), max( - 0, int(radius_y)) - assert min(radius_x, radius_y) == 0 - ind = gt_labels[j] - get_ellip_gaussian_2D( - center_heatmap_target[batch_id, ind], - [base_center2d_x_int, base_center2d_y_int], radius_x, - radius_y) - else: - base_center2d_x_int, base_center2d_y_int = \ - base_center2d.int() - scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1]) - scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0]) - radius = gaussian_radius([scale_box_h, scale_box_w], - min_overlap=0.7) - radius = max(0, int(radius)) - ind = gt_labels[j] - gen_gaussian_target( - center_heatmap_target[batch_id, ind], - [base_center2d_x_int, base_center2d_y_int], radius) - - avg_factor = max(1, center_heatmap_target.eq(1).sum()) - num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list] - max_objs = max(num_ctrs) - batch_indices = [ - centers2d_list[0].new_full((num_ctrs[i], ), i) - for i in range(batch_size) - ] - batch_indices = torch.cat(batch_indices, dim=0) - reg_mask = torch.zeros( - (batch_size, max_objs), - dtype=torch.bool).to(base_centers2d_list[0].device) - gt_bboxes_3d = input_metas['box_type_3d'].cat(gt_bboxes_3d_list) - gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device) - - # encode original local yaw to multibin format - orienations_target = self.bbox_coder.encode(gt_bboxes_3d) - - batch_base_centers2d = base_centers2d_list[0].new_zeros( - (batch_size, max_objs, 2)) - - for i in range(batch_size): - reg_mask[i, :num_ctrs[i]] = 1 - batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i] - - flatten_reg_mask = reg_mask.flatten() - - # transform base centers2d from input scale to output scale - batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio - - dimensions_target = gt_bboxes_3d.tensor[:, 3:6] - labels_3d = torch.cat(gt_labels_3d_list) - keypoints2d_target = torch.cat(keypoints2d_list) - keypoints_mask = torch.cat(keypoints_mask_list) - keypoints_depth_mask = torch.cat(keypoints_depth_mask_list) - offsets2d_target = torch.cat(offsets2d_list) - 
bboxes2d = torch.cat(gt_bboxes_list) - - # transform FCOS style bbox into [x1, y1, x2, y2] format. - bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]], - dim=-1) - depths = torch.cat(depths_list) - - target_labels = dict( - base_centers2d_target=batch_base_centers2d.int(), - labels3d=labels_3d, - reg_mask=flatten_reg_mask, - batch_indices=batch_indices, - bboxes2d_target=bboxes2d_target, - depth_target=depths, - keypoints2d_target=keypoints2d_target, - keypoints_mask=keypoints_mask, - keypoints_depth_mask=keypoints_depth_mask, - orienations_target=orienations_target, - offsets2d_target=offsets2d_target, - dimensions_target=dimensions_target, - downsample_ratio=1 / width_ratio) - - return center_heatmap_target, avg_factor, target_labels - - def loss(self, - cls_scores, - bbox_preds, - gt_bboxes, - gt_labels, - gt_bboxes_3d, - gt_labels_3d, - centers2d, - depths, - attr_labels, - input_metas, - gt_bboxes_ignore=None): - """Compute loss of the head. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level. - shape (num_gt, 4). - bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel - number is bbox_code_size. - shape (B, 7, H, W). - gt_bboxes (list[Tensor]): Ground truth bboxes for each image. - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): Class indices corresponding to each box. - shape (num_gts, ). - gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground - truth. it is the flipped gt_bboxes - gt_labels_3d (list[Tensor]): Same as gt_labels. - centers2d (list[Tensor]): 2D centers on the image. - shape (num_gts, 2). - depths (list[Tensor]): Depth ground truth. - shape (num_gts, ). - attr_labels (list[Tensor]): Attributes indices of each box. - In kitti it's None. - input_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes_ignore (None | list[Tensor]): Specify which bounding - boxes can be ignored when computing the loss. - Default: None. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert len(cls_scores) == len(bbox_preds) == 1 - assert attr_labels is None - assert gt_bboxes_ignore is None - center2d_heatmap = cls_scores[0] - pred_reg = bbox_preds[0] - - center2d_heatmap_target, avg_factor, target_labels = \ - self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, depths, - center2d_heatmap.shape, - input_metas[0]['pad_shape'], - input_metas) - - preds = self.get_predictions( - pred_reg=pred_reg, - labels3d=target_labels['labels3d'], - centers2d=target_labels['base_centers2d_target'], - reg_mask=target_labels['reg_mask'], - batch_indices=target_labels['batch_indices'], - input_metas=input_metas, - downsample_ratio=target_labels['downsample_ratio']) - - # heatmap loss - loss_cls = self.loss_cls( - center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor) - - # bbox2d regression loss - loss_bbox = self.loss_bbox(preds['bboxes2d'], - target_labels['bboxes2d_target']) - - # keypoints loss, the keypoints in predictions and target are all - # local coordinates. 
Check the mask dtype should be bool, not int - # or float to ensure the indexing is bool index - keypoints2d_mask = target_labels['keypoints2d_mask'] - loss_keypoints = self.loss_keypoints( - preds['keypoints2d'][keypoints2d_mask], - target_labels['keypoints2d_target'][keypoints2d_mask]) - - # orientations loss - loss_dir = self.loss_dir(preds['orientations'], - target_labels['orientations_target']) - - # dimensions loss - loss_dims = self.loss_dims(preds['dimensions'], - target_labels['dimensions_target']) - - # offsets for center heatmap - loss_offsets2d = self.loss_offsets2d(preds['offsets2d'], - target_labels['offsets2d_target']) - - # directly regressed depth loss with direct depth uncertainty loss - direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty']) - loss_weight_1 = self.loss_direct_depth.loss_weight - loss_direct_depth = self.loss_direct_depth( - preds['direct_depth'], target_labels['depth_target'], - direct_depth_weights) - loss_uncertainty_1 =\ - preds['direct_depth_uncertainty'] * loss_weight_1 - loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean() - - # keypoints decoded depth loss with keypoints depth uncertainty loss - depth_mask = target_labels['keypoints_depth_mask'] - depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(1, 3) - valid_keypoints_depth_uncertainty = preds[ - 'keypoints_depth_uncertainty'][depth_mask] - valid_keypoints_depth_weights = torch.exp( - -valid_keypoints_depth_uncertainty) - loss_keypoints_depth = self.loss_keypoint_depth( - preds['keypoints_depth'][depth_mask], depth_target[depth_mask], - valid_keypoints_depth_weights) - loss_weight_2 = self.loss_keypoints_depth.loss_weight - loss_uncertainty_2 =\ - valid_keypoints_depth_uncertainty * loss_weight_2 - loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean() - - # combined depth loss for optimiaze the uncertainty - loss_combined_depth = self.loss_combined_depth( - preds['combined_depth'], target_labels['depth_target']) - - loss_dict = dict( - loss_cls=loss_cls, - loss_bbox=loss_bbox, - loss_keypoints=loss_keypoints, - loss_dir=loss_dir, - loss_dims=loss_dims, - loss_offsets2d=loss_offsets2d, - loss_direct_depth=loss_direct_depth, - loss_keypoints_depth=loss_keypoints_depth, - loss_combined_depth=loss_combined_depth) - - return loss_dict +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import xavier_init +from torch import nn as nn + +from mmdet3d.core.utils import get_ellip_gaussian_2D +from mmdet3d.models.model_utils import EdgeFusionModule +from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices, + get_keypoints, handle_proj_objs) +from mmdet.core import multi_apply +from mmdet.core.bbox.builder import build_bbox_coder +from mmdet.models.utils import gaussian_radius, gen_gaussian_target +from mmdet.models.utils.gaussian_target import (get_local_maximum, + get_topk_from_heatmap, + transpose_and_gather_feat) +from ..builder import HEADS, build_loss +from .anchor_free_mono3d_head import AnchorFreeMono3DHead + + +@HEADS.register_module() +class MonoFlexHead(AnchorFreeMono3DHead): + r"""MonoFlex head used in `MonoFlex `_ + + .. 
code-block:: none
+
+                 / --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls
+                 |
+                 | --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox
+                 |
+                 | --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets
+                 |
+                 | --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets
+                 |
+                 | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
+         feature
+                 | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty
+                 |
+                 | --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions
+                 |
+                 |                  |--- 1 x 1 conv --> ori cls
+                 | --> 3 x 3 conv --|
+                 |                  |--- 1 x 1 conv --> ori offsets
+                 |
+                 | --> 3 x 3 conv --> 1 x 1 conv --> depth
+                 |
+                 \ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty
+
+    Args:
+        use_edge_fusion (bool): Whether to use the edge fusion module
+            during feature extraction.
+        edge_fusion_inds (list[tuple]): Indices of features on which to
+            apply edge fusion.
+        edge_heatmap_ratio (float): Ratio used when generating the target
+            heatmap.
+        filter_outside_objs (bool, optional): Whether to filter the
+            outside objects. Default: True.
+        loss_cls (dict, optional): Config of classification loss.
+            Default: dict(type='GaussianFocalLoss', loss_weight=1.0).
+        loss_bbox (dict, optional): Config of localization loss.
+            Default: dict(type='IoULoss', loss_weight=0.1).
+        loss_dir (dict, optional): Config of direction classification loss.
+            Default: dict(type='MultiBinLoss', loss_weight=0.1).
+        loss_keypoints (dict, optional): Config of keypoints loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_dims (dict, optional): Config of dimensions loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_offsets2d (dict, optional): Config of offsets2d loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_direct_depth (dict, optional): Config of directly regressed
+            depth loss. Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_keypoints_depth (dict, optional): Config of keypoint-decoded
+            depth loss. Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_combined_depth (dict, optional): Config of combined depth loss.
+            Default: dict(type='L1Loss', loss_weight=0.1).
+        loss_attr (dict, optional): Config of attribute classification loss.
+            Default: None in MonoFlex.
+        bbox_coder (dict, optional): Bbox coder for encoding and decoding
+            boxes. Default: dict(type='MonoFlexCoder', code_size=7).
+        norm_cfg (dict, optional): Dictionary to construct and configure the
+            norm layer. Default: dict(type='BN').
+        init_cfg (dict): Initialization config dict. Default: None.
+ """ # noqa: E501 + + def __init__(self, + num_classes, + in_channels, + use_edge_fusion, + edge_fusion_inds, + edge_heatmap_ratio, + filter_outside_objs=True, + loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=0.1), + loss_dir=dict(type='MultiBinLoss', loss_weight=0.1), + loss_keypoints=dict(type='L1Loss', loss_weight=0.1), + loss_dims=dict(type='L1Loss', loss_weight=0.1), + loss_offsets2d=dict(type='L1Loss', loss_weight=0.1), + loss_direct_depth=dict(type='L1Loss', loss_weight=0.1), + loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1), + loss_combined_depth=dict(type='L1Loss', loss_weight=0.1), + loss_attr=None, + bbox_coder=dict(type='MonoFlexCoder', code_size=7), + norm_cfg=dict(type='BN'), + init_cfg=None, + init_bias=-2.19, + **kwargs): + self.use_edge_fusion = use_edge_fusion + self.edge_fusion_inds = edge_fusion_inds + super().__init__( + num_classes, + in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_dir=loss_dir, + loss_attr=loss_attr, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.filter_outside_objs = filter_outside_objs + self.edge_heatmap_ratio = edge_heatmap_ratio + self.init_bias = init_bias + self.loss_dir = build_loss(loss_dir) + self.loss_keypoints = build_loss(loss_keypoints) + self.loss_dims = build_loss(loss_dims) + self.loss_offsets2d = build_loss(loss_offsets2d) + self.loss_direct_depth = build_loss(loss_direct_depth) + self.loss_keypoints_depth = build_loss(loss_keypoints_depth) + self.loss_combined_depth = build_loss(loss_combined_depth) + self.bbox_coder = build_bbox_coder(bbox_coder) + + def _init_edge_module(self): + """Initialize edge fusion module for feature extraction.""" + self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256) + for i in range(len(self.edge_fusion_inds)): + reg_inds, out_inds = self.edge_fusion_inds[i] + out_channels = self.group_reg_dims[reg_inds][out_inds] + fusion_layer = EdgeFusionModule(out_channels, 256) + layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}' + self.add_module(layer_name, fusion_layer) + + def init_weights(self): + """Initialize weights.""" + super().init_weights() + self.conv_cls.bias.data.fill_(self.init_bias) + xavier_init(self.conv_regs[4][0], gain=0.01) + xavier_init(self.conv_regs[7][0], gain=0.01) + for m in self.conv_regs.modules(): + if isinstance(m, nn.Conv2d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + self.conv_cls_prev = self._init_branch( + conv_channels=self.cls_branch, + conv_strides=(1, ) * len(self.cls_branch)) + self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels, + 1) + # init regression head + self.conv_reg_prevs = nn.ModuleList() + # init output head + self.conv_regs = nn.ModuleList() + # group_reg_dims: + # ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, )) + for i in range(len(self.group_reg_dims)): + reg_dims = self.group_reg_dims[i] + reg_branch_channels = self.reg_branch[i] + out_channel = self.out_channels[i] + reg_list = nn.ModuleList() + if len(reg_branch_channels) > 0: + self.conv_reg_prevs.append( + self._init_branch( + conv_channels=reg_branch_channels, + conv_strides=(1, ) * len(reg_branch_channels))) + for reg_dim in reg_dims: + reg_list.append(nn.Conv2d(out_channel, reg_dim, 1)) + self.conv_regs.append(reg_list) + else: + self.conv_reg_prevs.append(None) + for reg_dim in reg_dims: + reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1)) + 
self.conv_regs.append(reg_list) + + def _init_layers(self): + """Initialize layers of the head.""" + self._init_predictor() + if self.use_edge_fusion: + self._init_edge_module() + + def forward_train(self, x, input_metas, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, + gt_bboxes_ignore, proposal_cfg, **kwargs): + """ + Args: + x (list[Tensor]): Features from FPN. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (list[Tensor]): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image, + shape (num_gts, self.bbox_code_size). + gt_labels_3d (list[Tensor]): 3D ground truth labels of each box, + shape (num_gts,). + centers2d (list[Tensor]): Projected 3D center of each box, + shape (num_gts, 2). + depths (list[Tensor]): Depth of projected 3D center of each box, + shape (num_gts,). + attr_labels (list[Tensor]): Attribute labels of each box, + shape (num_gts,). + gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. + """ + outs = self(x, input_metas) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths, + attr_labels, input_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels, + input_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes( + *outs, input_metas, cfg=proposal_cfg) + return losses, proposal_list + + def forward(self, feats, input_metas): + """Forward features from the upstream network. + + Args: + feats (list[Tensor]): Features from the upstream network, each is + a 4D-tensor. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + """ + mlvl_input_metas = [input_metas for i in range(len(feats))] + return multi_apply(self.forward_single, feats, mlvl_input_metas) + + def forward_single(self, x, input_metas): + """Forward features of a single scale level. + + Args: + x (Tensor): Feature maps from a specific FPN feature level. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple: Scores for each class, bbox predictions. 
+ """ + img_h, img_w = input_metas[0]['pad_shape'][:2] + batch_size, _, feat_h, feat_w = x.shape + downsample_ratio = img_h / feat_h + + for conv_cls_prev_layer in self.conv_cls_prev: + cls_feat = conv_cls_prev_layer(x) + out_cls = self.conv_cls(cls_feat) + + if self.use_edge_fusion: + # calculate the edge indices for the batch data + edge_indices_list = get_edge_indices( + input_metas, downsample_ratio, device=x.device) + edge_lens = [ + edge_indices.shape[0] for edge_indices in edge_indices_list + ] + max_edge_len = max(edge_lens) + edge_indices = x.new_zeros((batch_size, max_edge_len, 2), + dtype=torch.long) + for i in range(batch_size): + edge_indices[i, :edge_lens[i]] = edge_indices_list[i] + # cls feature map edge fusion + out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices, + edge_lens, feat_h, feat_w) + + bbox_pred = [] + + for i in range(len(self.group_reg_dims)): + reg_feat = x.clone() + # feature regression head + if len(self.reg_branch[i]) > 0: + for conv_reg_prev_layer in self.conv_reg_prevs[i]: + reg_feat = conv_reg_prev_layer(reg_feat) + + for j, conv_reg in enumerate(self.conv_regs[i]): + out_reg = conv_reg(reg_feat) + # Use Edge Fusion Module + if self.use_edge_fusion and (i, j) in self.edge_fusion_inds: + # reg feature map edge fusion + out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format( + i, j))(reg_feat, out_reg, edge_indices, edge_lens, + feat_h, feat_w) + bbox_pred.append(out_reg) + + bbox_pred = torch.cat(bbox_pred, dim=1) + cls_score = out_cls.sigmoid() # turn to 0-1 + cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4) + + return cls_score, bbox_pred + + def get_bboxes(self, cls_scores, bbox_preds, input_metas): + """Generate bboxes from bbox head predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level. + bbox_preds (list[Tensor]): Box regression for each scale. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If True, return boxes in original image space. + Returns: + list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]: + Each item in result_list is 4-tuple. + """ + assert len(cls_scores) == len(bbox_preds) == 1 + cam2imgs = torch.stack([ + cls_scores[0].new_tensor(input_meta['cam2img']) + for input_meta in input_metas + ]) + batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap( + cls_scores[0], + bbox_preds[0], + input_metas, + cam2imgs=cam2imgs, + topk=100, + kernel=3) + + result_list = [] + for img_id in range(len(input_metas)): + + bboxes = batch_bboxes[img_id] + scores = batch_scores[img_id] + labels = batch_topk_labels[img_id] + + keep_idx = scores > 0.25 + bboxes = bboxes[keep_idx] + scores = scores[keep_idx] + labels = labels[keep_idx] + + bboxes = input_metas[img_id]['box_type_3d']( + bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) + attrs = None + result_list.append((bboxes, scores, labels, attrs)) + + return result_list + + def decode_heatmap(self, + cls_score, + reg_pred, + input_metas, + cam2imgs, + topk=100, + kernel=3): + """Transform outputs into detections raw bbox predictions. + + Args: + class_score (Tensor): Center predict heatmap, + shape (B, num_classes, H, W). + reg_pred (Tensor): Box regression map. + shape (B, channel, H , W). + input_metas (List[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cam2imgs (Tensor): Camera intrinsic matrix. + shape (N, 4, 4) + topk (int, optional): Get top k center keypoints from heatmap. + Default 100. 
+            kernel (int, optional): Max pooling kernel for extracting local
+                maximum pixels. Default: 3.
+
+        Returns:
+            tuple[torch.Tensor]: Decoded output of MonoFlexHead, containing
+                the following Tensors:
+              - batch_bboxes (Tensor): Coords of each 3D box.
+                    shape (B, k, 7)
+              - batch_scores (Tensor): Scores of each 3D box.
+                    shape (B, k)
+              - batch_topk_labels (Tensor): Categories of each 3D box.
+                    shape (B, k)
+        """
+        img_h, img_w = input_metas[0]['pad_shape'][:2]
+        batch_size, _, feat_h, feat_w = cls_score.shape
+
+        downsample_ratio = img_h / feat_h
+        center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel)
+
+        *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap(
+            center_heatmap_pred, k=topk)
+        batch_scores, batch_index, batch_topk_labels = batch_dets
+
+        regression = transpose_and_gather_feat(reg_pred, batch_index)
+        regression = regression.view(-1, 8)
+
+        pred_base_centers2d = torch.cat(
+            [topk_xs.view(-1, 1),
+             topk_ys.view(-1, 1).float()], dim=1)
+        preds = self.bbox_coder.decode(regression, batch_topk_labels,
+                                       downsample_ratio, cam2imgs)
+        pred_locations = self.bbox_coder.decode_location(
+            pred_base_centers2d, preds['offsets2d'], preds['combined_depth'],
+            cam2imgs, downsample_ratio)
+        pred_yaws = self.bbox_coder.decode_orientation(
+            preds['orientations']).unsqueeze(-1)
+        pred_dims = preds['dimensions']
+        batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws), dim=1)
+        batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size)
+        return batch_bboxes, batch_scores, batch_topk_labels
+
+    def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask,
+                        batch_indices, input_metas, downsample_ratio):
+        """Prepare predictions for computing loss.
+
+        Args:
+            pred_reg (Tensor): Box regression map.
+                shape (B, channel, H, W).
+            labels3d (Tensor): Labels of each 3D box.
+                shape (B * max_objs, )
+            centers2d (Tensor): Coords of each projected 3D box
+                center on image. shape (N, 2)
+            reg_mask (Tensor): Mask of the existence of the 3D box.
+                shape (B * max_objs, )
+            batch_indices (Tensor): Batch indices of the 3D box.
+                shape (N, 3)
+            input_metas (list[dict]): Meta information of each image,
+                e.g., image size, scaling factor, etc.
+            downsample_ratio (int): The stride of the feature map.
+
+        Returns:
+            dict: The predictions for computing loss.
+        """
+        batch, channel = pred_reg.shape[0], pred_reg.shape[1]
+        w = pred_reg.shape[3]
+        cam2imgs = torch.stack([
+            centers2d.new_tensor(input_meta['cam2img'])
+            for input_meta in input_metas
+        ])
+        # (batch_size, 4, 4) -> (N, 4, 4)
+        cam2imgs = cam2imgs[batch_indices, :, :]
+        centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0]
+        centers2d_inds = centers2d_inds.view(batch, -1)
+        pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds)
+        pred_regression_pois = pred_regression.view(-1, channel)[reg_mask]
+        preds = self.bbox_coder.decode(pred_regression_pois, labels3d,
+                                       downsample_ratio, cam2imgs)
+
+        return preds
+
+    def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list,
+                    gt_labels_3d_list, centers2d_list, depths_list, feat_shape,
+                    img_shape, input_metas):
+        """Get training targets for batch images.
+
+        Args:
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes of each
+                image, shape (num_gt, 4).
+            gt_labels_list (list[Tensor]): Ground truth labels of each
+                box, shape (num_gt,).
+            gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D
+                ground truth bboxes of each image,
+                shape (num_gt, bbox_code_size).
+            gt_labels_3d_list (list[Tensor]): 3D ground truth labels of
+                each box, shape (num_gt,).
+ centers2d_list (list[Tensor]): Projected 3D centers onto 2D + image, shape (num_gt, 2). + depths_list (list[Tensor]): Depth of projected 3D centers onto 2D + image, each has shape (num_gt, 1). + feat_shape (tuple[int]): Feature map shape with value, + shape (B, _, H, W). + img_shape (tuple[int]): Image shape in [h, w] format. + input_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple[Tensor, dict]: The Tensor value is the targets of + center heatmap, the dict has components below: + - base_centers2d_target (Tensor): Coords of each projected 3D box + center on image. shape (B * max_objs, 2), [dtype: int] + - labels3d (Tensor): Labels of each 3D box. + shape (N, ) + - reg_mask (Tensor): Mask of the existence of the 3D box. + shape (B * max_objs, ) + - batch_indices (Tensor): Batch id of the 3D box. + shape (N, ) + - depth_target (Tensor): Depth target of each 3D box. + shape (N, ) + - keypoints2d_target (Tensor): Keypoints of each projected 3D box + on image. shape (N, 10, 2) + - keypoints_mask (Tensor): Keypoints mask of each projected 3D + box on image. shape (N, 10) + - keypoints_depth_mask (Tensor): Depths decoded from keypoints + of each 3D box. shape (N, 3) + - orientations_target (Tensor): Orientation (encoded local yaw) + target of each 3D box. shape (N, ) + - offsets2d_target (Tensor): Offsets target of each projected + 3D box. shape (N, 2) + - dimensions_target (Tensor): Dimensions target of each 3D box. + shape (N, 3) + - downsample_ratio (int): The stride of feature map. + """ + + img_h, img_w = img_shape[:2] + batch_size, _, feat_h, feat_w = feat_shape + + width_ratio = float(feat_w / img_w) # 1/4 + height_ratio = float(feat_h / img_h) # 1/4 + + assert width_ratio == height_ratio + + # Whether to filter the objects which are not in FOV. + if self.filter_outside_objs: + filter_outside_objs(gt_bboxes_list, gt_labels_list, + gt_bboxes_3d_list, gt_labels_3d_list, + centers2d_list, input_metas) + + # transform centers2d to base centers2d for regression and + # heatmap generation. 
+ # centers2d = int(base_centers2d) + offsets2d + base_centers2d_list, offsets2d_list, trunc_mask_list = \ + handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas) + + keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \ + get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas) + + center_heatmap_target = gt_bboxes_list[-1].new_zeros( + [batch_size, self.num_classes, feat_h, feat_w]) + + for batch_id in range(batch_size): + # project gt_bboxes from input image to feat map + gt_bboxes = gt_bboxes_list[batch_id] * width_ratio + gt_labels = gt_labels_list[batch_id] + + # project base centers2d from input image to feat map + gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio + trunc_masks = trunc_mask_list[batch_id] + + for j, base_center2d in enumerate(gt_base_centers2d): + if trunc_masks[j]: + # for outside objects, generate ellipse heatmap + base_center2d_x_int, base_center2d_y_int = \ + base_center2d.int() + scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0], + gt_bboxes[j][2] - base_center2d_x_int) + scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1], + gt_bboxes[j][3] - base_center2d_y_int) + radius_x = scale_box_w * self.edge_heatmap_ratio + radius_y = scale_box_h * self.edge_heatmap_ratio + radius_x, radius_y = max(0, int(radius_x)), max( + 0, int(radius_y)) + assert min(radius_x, radius_y) == 0 + ind = gt_labels[j] + get_ellip_gaussian_2D( + center_heatmap_target[batch_id, ind], + [base_center2d_x_int, base_center2d_y_int], radius_x, + radius_y) + else: + base_center2d_x_int, base_center2d_y_int = \ + base_center2d.int() + scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1]) + scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0]) + radius = gaussian_radius([scale_box_h, scale_box_w], + min_overlap=0.7) + radius = max(0, int(radius)) + ind = gt_labels[j] + gen_gaussian_target( + center_heatmap_target[batch_id, ind], + [base_center2d_x_int, base_center2d_y_int], radius) + + avg_factor = max(1, center_heatmap_target.eq(1).sum()) + num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list] + max_objs = max(num_ctrs) + batch_indices = [ + centers2d_list[0].new_full((num_ctrs[i], ), i) + for i in range(batch_size) + ] + batch_indices = torch.cat(batch_indices, dim=0) + reg_mask = torch.zeros( + (batch_size, max_objs), + dtype=torch.bool).to(base_centers2d_list[0].device) + gt_bboxes_3d = input_metas['box_type_3d'].cat(gt_bboxes_3d_list) + gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device) + + # encode original local yaw to multibin format + orienations_target = self.bbox_coder.encode(gt_bboxes_3d) + + batch_base_centers2d = base_centers2d_list[0].new_zeros( + (batch_size, max_objs, 2)) + + for i in range(batch_size): + reg_mask[i, :num_ctrs[i]] = 1 + batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i] + + flatten_reg_mask = reg_mask.flatten() + + # transform base centers2d from input scale to output scale + batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio + + dimensions_target = gt_bboxes_3d.tensor[:, 3:6] + labels_3d = torch.cat(gt_labels_3d_list) + keypoints2d_target = torch.cat(keypoints2d_list) + keypoints_mask = torch.cat(keypoints_mask_list) + keypoints_depth_mask = torch.cat(keypoints_depth_mask_list) + offsets2d_target = torch.cat(offsets2d_list) + bboxes2d = torch.cat(gt_bboxes_list) + + # transform FCOS style bbox into [x1, y1, x2, y2] format. 
+        bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]],
+                                    dim=-1)
+        depths = torch.cat(depths_list)
+
+        target_labels = dict(
+            base_centers2d_target=batch_base_centers2d.int(),
+            labels3d=labels_3d,
+            reg_mask=flatten_reg_mask,
+            batch_indices=batch_indices,
+            bboxes2d_target=bboxes2d_target,
+            depth_target=depths,
+            keypoints2d_target=keypoints2d_target,
+            keypoints_mask=keypoints_mask,
+            keypoints_depth_mask=keypoints_depth_mask,
+            orientations_target=orienations_target,
+            offsets2d_target=offsets2d_target,
+            dimensions_target=dimensions_target,
+            downsample_ratio=1 / width_ratio)
+
+        return center_heatmap_target, avg_factor, target_labels
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             gt_bboxes,
+             gt_labels,
+             gt_bboxes_3d,
+             gt_labels_3d,
+             centers2d,
+             depths,
+             attr_labels,
+             input_metas,
+             gt_bboxes_ignore=None):
+        """Compute loss of the head.
+
+        Args:
+            cls_scores (list[Tensor]): Box scores for each scale level.
+                shape (B, num_classes, H, W).
+            bbox_preds (list[Tensor]): Box regression for each scale level;
+                each is a 4D-tensor whose channel number is bbox_code_size.
+                shape (B, 7, H, W).
+            gt_bboxes (list[Tensor]): Ground truth bboxes for each image.
+                shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels (list[Tensor]): Class indices corresponding to each box.
+                shape (num_gts, ).
+            gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D ground
+                truth boxes. It is the flipped gt_bboxes.
+            gt_labels_3d (list[Tensor]): Same as gt_labels.
+            centers2d (list[Tensor]): 2D centers on the image.
+                shape (num_gts, 2).
+            depths (list[Tensor]): Depth ground truth.
+                shape (num_gts, ).
+            attr_labels (list[Tensor]): Attribute indices of each box.
+                In KITTI, it is None.
+            input_metas (list[dict]): Meta information of each image, e.g.,
+                image size, scaling factor, etc.
+            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+                boxes can be ignored when computing the loss.
+                Default: None.
+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components.
+        """
+        assert len(cls_scores) == len(bbox_preds) == 1
+        assert attr_labels is None
+        assert gt_bboxes_ignore is None
+        center2d_heatmap = cls_scores[0]
+        pred_reg = bbox_preds[0]
+
+        center2d_heatmap_target, avg_factor, target_labels = \
+            self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d,
+                             gt_labels_3d, centers2d, depths,
+                             center2d_heatmap.shape,
+                             input_metas[0]['pad_shape'],
+                             input_metas)
+
+        preds = self.get_predictions(
+            pred_reg=pred_reg,
+            labels3d=target_labels['labels3d'],
+            centers2d=target_labels['base_centers2d_target'],
+            reg_mask=target_labels['reg_mask'],
+            batch_indices=target_labels['batch_indices'],
+            input_metas=input_metas,
+            downsample_ratio=target_labels['downsample_ratio'])
+
+        # heatmap loss
+        loss_cls = self.loss_cls(
+            center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor)
+
+        # bbox2d regression loss
+        loss_bbox = self.loss_bbox(preds['bboxes2d'],
+                                   target_labels['bboxes2d_target'])
+
+        # keypoints loss, the keypoints in predictions and targets are all in
+        # local coordinates; the mask dtype must be bool (not int or float)
+        # so that boolean indexing is applied.
+        keypoints2d_mask = target_labels['keypoints_mask']
+        loss_keypoints = self.loss_keypoints(
+            preds['keypoints2d'][keypoints2d_mask],
+            target_labels['keypoints2d_target'][keypoints2d_mask])
+
+        # orientations loss
+        loss_dir = self.loss_dir(preds['orientations'],
+                                 target_labels['orientations_target'])
+
+        # dimensions loss
+        loss_dims = self.loss_dims(preds['dimensions'],
+                                   target_labels['dimensions_target'])
+
+        # offsets for center heatmap
+        loss_offsets2d = self.loss_offsets2d(preds['offsets2d'],
+                                             target_labels['offsets2d_target'])
+
+        # directly regressed depth loss with direct depth uncertainty loss
+        direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty'])
+        loss_weight_1 = self.loss_direct_depth.loss_weight
+        loss_direct_depth = self.loss_direct_depth(
+            preds['direct_depth'], target_labels['depth_target'],
+            direct_depth_weights)
+        loss_uncertainty_1 =\
+            preds['direct_depth_uncertainty'] * loss_weight_1
+        loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean()
+
+        # keypoints decoded depth loss with keypoints depth uncertainty loss
+        depth_mask = target_labels['keypoints_depth_mask']
+        depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(1, 3)
+        valid_keypoints_depth_uncertainty = preds[
+            'keypoints_depth_uncertainty'][depth_mask]
+        valid_keypoints_depth_weights = torch.exp(
+            -valid_keypoints_depth_uncertainty)
+        loss_keypoints_depth = self.loss_keypoints_depth(
+            preds['keypoints_depth'][depth_mask], depth_target[depth_mask],
+            valid_keypoints_depth_weights)
+        loss_weight_2 = self.loss_keypoints_depth.loss_weight
+        loss_uncertainty_2 =\
+            valid_keypoints_depth_uncertainty * loss_weight_2
+        loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean()
+
+        # combined depth loss for optimizing the uncertainty
+        loss_combined_depth = self.loss_combined_depth(
+            preds['combined_depth'], target_labels['depth_target'])
+
+        loss_dict = dict(
+            loss_cls=loss_cls,
+            loss_bbox=loss_bbox,
+            loss_keypoints=loss_keypoints,
+            loss_dir=loss_dir,
+            loss_dims=loss_dims,
+            loss_offsets2d=loss_offsets2d,
+            loss_direct_depth=loss_direct_depth,
+            loss_keypoints_depth=loss_keypoints_depth,
+            loss_combined_depth=loss_combined_depth)
+
+        return loss_dict
diff --git a/mmdet3d/models/dense_heads/parta2_rpn_head.py b/mmdet3d/models/dense_heads/parta2_rpn_head.py
index a57e1a1..5f6c99d 100644
--- a/mmdet3d/models/dense_heads/parta2_rpn_head.py
+++ b/mmdet3d/models/dense_heads/parta2_rpn_head.py
@@ -1,310 +1,310 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import numpy as np
-import torch
-from mmcv.runner import force_fp32
-
-from mmdet3d.core import limit_period, xywhr2xyxyr
-from mmdet3d.core.post_processing import nms_bev, nms_normal_bev
-from ..builder import HEADS
-from .anchor3d_head import Anchor3DHead
-
-
-@HEADS.register_module()
-class PartA2RPNHead(Anchor3DHead):
-    """RPN head for PartA2.
-
-    Note:
-        The main difference between the PartA2 RPN head and the Anchor3DHead
-        lies in their output during inference. PartA2 RPN head further returns
-        the original classification score for the second stage since the bbox
-        head in RoI head does not do classification task.
-
-        Different from RPN heads in 2D detectors, this RPN head does
-        multi-class classification task and uses FocalLoss like the SECOND and
-        PointPillars do. But this head uses class agnostic nms rather than
-        multi-class nms.
-
-    Args:
-        num_classes (int): Number of classes.
- in_channels (int): Number of channels in the input feature map. - train_cfg (dict): Train configs. - test_cfg (dict): Test configs. - feat_channels (int): Number of channels of the feature map. - use_direction_classifier (bool): Whether to add a direction classifier. - anchor_generator(dict): Config dict of anchor generator. - assigner_per_size (bool): Whether to do assignment for each separate - anchor size. - assign_per_class (bool): Whether to do assignment for each class. - diff_rad_by_sin (bool): Whether to change the difference into sin - difference for box regression loss. - dir_offset (float | int): The offset of BEV rotation angles - (TODO: may be moved into box coder) - dir_limit_offset (float | int): The limited range of BEV - rotation angles. (TODO: may be moved into box coder) - bbox_coder (dict): Config dict of box coders. - loss_cls (dict): Config of classification loss. - loss_bbox (dict): Config of localization loss. - loss_dir (dict): Config of direction classifier loss. - """ - - def __init__(self, - num_classes, - in_channels, - train_cfg, - test_cfg, - feat_channels=256, - use_direction_classifier=True, - anchor_generator=dict( - type='Anchor3DRangeGenerator', - range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], - strides=[2], - sizes=[[3.9, 1.6, 1.56]], - rotations=[0, 1.57], - custom_values=[], - reshape_out=False), - assigner_per_size=False, - assign_per_class=False, - diff_rad_by_sin=True, - dir_offset=-np.pi / 2, - dir_limit_offset=0, - bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - loss_weight=1.0), - loss_bbox=dict( - type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), - loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2), - init_cfg=None): - super().__init__(num_classes, in_channels, train_cfg, test_cfg, - feat_channels, use_direction_classifier, - anchor_generator, assigner_per_size, assign_per_class, - diff_rad_by_sin, dir_offset, dir_limit_offset, - bbox_coder, loss_cls, loss_bbox, loss_dir, init_cfg) - - @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) - def loss(self, - cls_scores, - bbox_preds, - dir_cls_preds, - gt_bboxes, - gt_labels, - input_metas, - gt_bboxes_ignore=None): - """Calculate losses. - - Args: - cls_scores (list[torch.Tensor]): Multi-level class scores. - bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. - dir_cls_preds (list[torch.Tensor]): Multi-level direction - class predictions. - gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes - of each sample. - gt_labels (list[torch.Tensor]): Labels of each sample. - input_metas (list[dict]): Point cloud and image's meta info. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - - Returns: - dict[str, list[torch.Tensor]]: Classification, bbox, and - direction losses of each level. - - - loss_rpn_cls (list[torch.Tensor]): Classification losses. - - loss_rpn_bbox (list[torch.Tensor]): Box regression losses. - - loss_rpn_dir (list[torch.Tensor]): Direction classification - losses. - """ - loss_dict = super().loss(cls_scores, bbox_preds, dir_cls_preds, - gt_bboxes, gt_labels, input_metas, - gt_bboxes_ignore) - # change the loss key names to avoid conflict - return dict( - loss_rpn_cls=loss_dict['loss_cls'], - loss_rpn_bbox=loss_dict['loss_bbox'], - loss_rpn_dir=loss_dict['loss_dir']) - - def get_bboxes_single(self, - cls_scores, - bbox_preds, - dir_cls_preds, - mlvl_anchors, - input_meta, - cfg, - rescale=False): - """Get bboxes of single branch. 
- - Args: - cls_scores (torch.Tensor): Class score in single batch. - bbox_preds (torch.Tensor): Bbox prediction in single batch. - dir_cls_preds (torch.Tensor): Predictions of direction class - in single batch. - mlvl_anchors (List[torch.Tensor]): Multi-level anchors - in single batch. - input_meta (list[dict]): Contain pcd and img's meta info. - cfg (:obj:`ConfigDict`): Training or testing config. - rescale (list[torch.Tensor]): whether th rescale bbox. - - Returns: - dict: Predictions of single batch containing the following keys: - - - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - - scores_3d (torch.Tensor): Score of each bbox. - - labels_3d (torch.Tensor): Label of each bbox. - - cls_preds (torch.Tensor): Class score of each bbox. - """ - assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) - mlvl_bboxes = [] - mlvl_max_scores = [] - mlvl_label_pred = [] - mlvl_dir_scores = [] - mlvl_cls_score = [] - for cls_score, bbox_pred, dir_cls_pred, anchors in zip( - cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): - assert cls_score.size()[-2:] == bbox_pred.size()[-2:] - assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] - dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) - dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] - - cls_score = cls_score.permute(1, 2, - 0).reshape(-1, self.num_classes) - - if self.use_sigmoid_cls: - scores = cls_score.sigmoid() - else: - scores = cls_score.softmax(-1) - bbox_pred = bbox_pred.permute(1, 2, - 0).reshape(-1, self.box_code_size) - - nms_pre = cfg.get('nms_pre', -1) - if self.use_sigmoid_cls: - max_scores, pred_labels = scores.max(dim=1) - else: - max_scores, pred_labels = scores[:, :-1].max(dim=1) - # get topk - if nms_pre > 0 and scores.shape[0] > nms_pre: - topk_scores, topk_inds = max_scores.topk(nms_pre) - anchors = anchors[topk_inds, :] - bbox_pred = bbox_pred[topk_inds, :] - max_scores = topk_scores - cls_score = scores[topk_inds, :] - dir_cls_score = dir_cls_score[topk_inds] - pred_labels = pred_labels[topk_inds] - - bboxes = self.bbox_coder.decode(anchors, bbox_pred) - mlvl_bboxes.append(bboxes) - mlvl_max_scores.append(max_scores) - mlvl_cls_score.append(cls_score) - mlvl_label_pred.append(pred_labels) - mlvl_dir_scores.append(dir_cls_score) - - mlvl_bboxes = torch.cat(mlvl_bboxes) - mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( - mlvl_bboxes, box_dim=self.box_code_size).bev) - mlvl_max_scores = torch.cat(mlvl_max_scores) - mlvl_label_pred = torch.cat(mlvl_label_pred) - mlvl_dir_scores = torch.cat(mlvl_dir_scores) - # shape [k, num_class] before sigmoid - # PartA2 need to keep raw classification score - # because the bbox head in the second stage does not have - # classification branch, - # roi head need this score as classification score - mlvl_cls_score = torch.cat(mlvl_cls_score) - - score_thr = cfg.get('score_thr', 0) - result = self.class_agnostic_nms(mlvl_bboxes, mlvl_bboxes_for_nms, - mlvl_max_scores, mlvl_label_pred, - mlvl_cls_score, mlvl_dir_scores, - score_thr, cfg.nms_post, cfg, - input_meta) - - return result - - def class_agnostic_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms, - mlvl_max_scores, mlvl_label_pred, mlvl_cls_score, - mlvl_dir_scores, score_thr, max_num, cfg, - input_meta): - """Class agnostic nms for single batch. - - Args: - mlvl_bboxes (torch.Tensor): Bboxes from Multi-level. - mlvl_bboxes_for_nms (torch.Tensor): Bboxes for nms - (bev or minmax boxes) from Multi-level. - mlvl_max_scores (torch.Tensor): Max scores of Multi-level bbox. 
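For reference, the nms_pre branch of get_bboxes_single above keeps only the top-scoring anchors per level and indexes every per-anchor tensor with the same top-k indices before decoding. A small sketch of that pre-filtering under the sigmoid-classification branch (topk_prefilter is a hypothetical helper, not the head's API):

import torch

def topk_prefilter(scores, nms_pre, *tensors):
    # scores: (N, num_classes); tensors: any per-anchor tensors of shape (N, ...).
    max_scores, _ = scores.max(dim=1)
    if nms_pre <= 0 or scores.shape[0] <= nms_pre:
        return (scores, ) + tensors
    _, topk_inds = max_scores.topk(nms_pre)
    return (scores[topk_inds], ) + tuple(t[topk_inds] for t in tensors)

# toy usage
scores = torch.rand(1000, 3)
bbox_pred = torch.rand(1000, 7)
dir_score = torch.randint(0, 2, (1000, ))
scores, bbox_pred, dir_score = topk_prefilter(scores, 100, bbox_pred, dir_score)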
- mlvl_label_pred (torch.Tensor): Class predictions - of Multi-level bbox. - mlvl_cls_score (torch.Tensor): Class scores of - Multi-level bbox. - mlvl_dir_scores (torch.Tensor): Direction scores of - Multi-level bbox. - score_thr (int): Score threshold. - max_num (int): Max number of bboxes after nms. - cfg (:obj:`ConfigDict`): Training or testing config. - input_meta (dict): Contain pcd and img's meta info. - - Returns: - dict: Predictions of single batch. Contain the keys: - - - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - - scores_3d (torch.Tensor): Score of each bbox. - - labels_3d (torch.Tensor): Label of each bbox. - - cls_preds (torch.Tensor): Class score of each bbox. - """ - bboxes = [] - scores = [] - labels = [] - dir_scores = [] - cls_scores = [] - score_thr_inds = mlvl_max_scores > score_thr - _scores = mlvl_max_scores[score_thr_inds] - _bboxes_for_nms = mlvl_bboxes_for_nms[score_thr_inds, :] - if cfg.use_rotate_nms: - nms_func = nms_bev - else: - nms_func = nms_normal_bev - selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr) - - _mlvl_bboxes = mlvl_bboxes[score_thr_inds, :] - _mlvl_dir_scores = mlvl_dir_scores[score_thr_inds] - _mlvl_label_pred = mlvl_label_pred[score_thr_inds] - _mlvl_cls_score = mlvl_cls_score[score_thr_inds] - - if len(selected) > 0: - bboxes.append(_mlvl_bboxes[selected]) - scores.append(_scores[selected]) - labels.append(_mlvl_label_pred[selected]) - cls_scores.append(_mlvl_cls_score[selected]) - dir_scores.append(_mlvl_dir_scores[selected]) - dir_rot = limit_period(bboxes[-1][..., 6] - self.dir_offset, - self.dir_limit_offset, np.pi) - bboxes[-1][..., 6] = ( - dir_rot + self.dir_offset + - np.pi * dir_scores[-1].to(bboxes[-1].dtype)) - - if bboxes: - bboxes = torch.cat(bboxes, dim=0) - scores = torch.cat(scores, dim=0) - cls_scores = torch.cat(cls_scores, dim=0) - labels = torch.cat(labels, dim=0) - if bboxes.shape[0] > max_num: - _, inds = scores.sort(descending=True) - inds = inds[:max_num] - bboxes = bboxes[inds, :] - labels = labels[inds] - scores = scores[inds] - cls_scores = cls_scores[inds] - bboxes = input_meta['box_type_3d']( - bboxes, box_dim=self.box_code_size) - return dict( - boxes_3d=bboxes, - scores_3d=scores, - labels_3d=labels, - cls_preds=cls_scores # raw scores [max_num, cls_num] - ) - else: - return dict( - boxes_3d=input_meta['box_type_3d']( - mlvl_bboxes.new_zeros([0, self.box_code_size]), - box_dim=self.box_code_size), - scores_3d=mlvl_bboxes.new_zeros([0]), - labels_3d=mlvl_bboxes.new_zeros([0]), - cls_preds=mlvl_bboxes.new_zeros([0, mlvl_cls_score.shape[-1]])) +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.runner import force_fp32 + +from mmdet3d.core import limit_period, xywhr2xyxyr +from mmdet3d.core.post_processing import nms_bev, nms_normal_bev +from ..builder import HEADS +from .anchor3d_head import Anchor3DHead + + +@HEADS.register_module() +class PartA2RPNHead(Anchor3DHead): + """RPN head for PartA2. + + Note: + The main difference between the PartA2 RPN head and the Anchor3DHead + lies in their output during inference. PartA2 RPN head further returns + the original classification score for the second stage since the bbox + head in RoI head does not do classification task. + + Different from RPN heads in 2D detectors, this RPN head does + multi-class classification task and uses FocalLoss like the SECOND and + PointPillars do. But this head uses class agnostic nms rather than + multi-class nms. + + Args: + num_classes (int): Number of classes. 
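The tail of class_agnostic_nms above restores each box yaw from the two-bin direction classifier: the regressed angle is folded into a half period with limit_period and pi times the predicted bin is added back. A standalone sketch; limit_period is re-implemented here only so the example runs on its own, the head itself imports it from mmdet3d.core:

import math
import torch

def limit_period(val, offset=0.0, period=math.pi):
    # fold val into [-offset * period, (1 - offset) * period)
    return val - torch.floor(val / period + offset) * period

def decode_yaw_with_dir(yaw, dir_score, dir_offset=-math.pi / 2,
                        dir_limit_offset=0.0):
    dir_rot = limit_period(yaw - dir_offset, dir_limit_offset, math.pi)
    return dir_rot + dir_offset + math.pi * dir_score.to(yaw.dtype)

# toy usage
yaw = torch.tensor([0.3, -2.8, 3.0])
dir_score = torch.tensor([0, 1, 1])
print(decode_yaw_with_dir(yaw, dir_score))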
+ in_channels (int): Number of channels in the input feature map. + train_cfg (dict): Train configs. + test_cfg (dict): Test configs. + feat_channels (int): Number of channels of the feature map. + use_direction_classifier (bool): Whether to add a direction classifier. + anchor_generator(dict): Config dict of anchor generator. + assigner_per_size (bool): Whether to do assignment for each separate + anchor size. + assign_per_class (bool): Whether to do assignment for each class. + diff_rad_by_sin (bool): Whether to change the difference into sin + difference for box regression loss. + dir_offset (float | int): The offset of BEV rotation angles + (TODO: may be moved into box coder) + dir_limit_offset (float | int): The limited range of BEV + rotation angles. (TODO: may be moved into box coder) + bbox_coder (dict): Config dict of box coders. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + loss_dir (dict): Config of direction classifier loss. + """ + + def __init__(self, + num_classes, + in_channels, + train_cfg, + test_cfg, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + strides=[2], + sizes=[[3.9, 1.6, 1.56]], + rotations=[0, 1.57], + custom_values=[], + reshape_out=False), + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=-np.pi / 2, + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2), + init_cfg=None): + super().__init__(num_classes, in_channels, train_cfg, test_cfg, + feat_channels, use_direction_classifier, + anchor_generator, assigner_per_size, assign_per_class, + diff_rad_by_sin, dir_offset, dir_limit_offset, + bbox_coder, loss_cls, loss_bbox, loss_dir, init_cfg) + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + gt_bboxes, + gt_labels, + input_metas, + gt_bboxes_ignore=None): + """Calculate losses. + + Args: + cls_scores (list[torch.Tensor]): Multi-level class scores. + bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. + dir_cls_preds (list[torch.Tensor]): Multi-level direction + class predictions. + gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes + of each sample. + gt_labels (list[torch.Tensor]): Labels of each sample. + input_metas (list[dict]): Point cloud and image's meta info. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + + Returns: + dict[str, list[torch.Tensor]]: Classification, bbox, and + direction losses of each level. + + - loss_rpn_cls (list[torch.Tensor]): Classification losses. + - loss_rpn_bbox (list[torch.Tensor]): Box regression losses. + - loss_rpn_dir (list[torch.Tensor]): Direction classification + losses. + """ + loss_dict = super().loss(cls_scores, bbox_preds, dir_cls_preds, + gt_bboxes, gt_labels, input_metas, + gt_bboxes_ignore) + # change the loss key names to avoid conflict + return dict( + loss_rpn_cls=loss_dict['loss_cls'], + loss_rpn_bbox=loss_dict['loss_bbox'], + loss_rpn_dir=loss_dict['loss_dir']) + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + mlvl_anchors, + input_meta, + cfg, + rescale=False): + """Get bboxes of single branch. 
+ + Args: + cls_scores (torch.Tensor): Class score in single batch. + bbox_preds (torch.Tensor): Bbox prediction in single batch. + dir_cls_preds (torch.Tensor): Predictions of direction class + in single batch. + mlvl_anchors (List[torch.Tensor]): Multi-level anchors + in single batch. + input_meta (list[dict]): Contain pcd and img's meta info. + cfg (:obj:`ConfigDict`): Training or testing config. + rescale (list[torch.Tensor]): whether th rescale bbox. + + Returns: + dict: Predictions of single batch containing the following keys: + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. + - scores_3d (torch.Tensor): Score of each bbox. + - labels_3d (torch.Tensor): Label of each bbox. + - cls_preds (torch.Tensor): Class score of each bbox. + """ + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_max_scores = [] + mlvl_label_pred = [] + mlvl_dir_scores = [] + mlvl_cls_score = [] + for cls_score, bbox_pred, dir_cls_pred, anchors in zip( + cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.num_classes) + + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, self.box_code_size) + + nms_pre = cfg.get('nms_pre', -1) + if self.use_sigmoid_cls: + max_scores, pred_labels = scores.max(dim=1) + else: + max_scores, pred_labels = scores[:, :-1].max(dim=1) + # get topk + if nms_pre > 0 and scores.shape[0] > nms_pre: + topk_scores, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + max_scores = topk_scores + cls_score = scores[topk_inds, :] + dir_cls_score = dir_cls_score[topk_inds] + pred_labels = pred_labels[topk_inds] + + bboxes = self.bbox_coder.decode(anchors, bbox_pred) + mlvl_bboxes.append(bboxes) + mlvl_max_scores.append(max_scores) + mlvl_cls_score.append(cls_score) + mlvl_label_pred.append(pred_labels) + mlvl_dir_scores.append(dir_cls_score) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + mlvl_bboxes, box_dim=self.box_code_size).bev) + mlvl_max_scores = torch.cat(mlvl_max_scores) + mlvl_label_pred = torch.cat(mlvl_label_pred) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + # shape [k, num_class] before sigmoid + # PartA2 need to keep raw classification score + # because the bbox head in the second stage does not have + # classification branch, + # roi head need this score as classification score + mlvl_cls_score = torch.cat(mlvl_cls_score) + + score_thr = cfg.get('score_thr', 0) + result = self.class_agnostic_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_max_scores, mlvl_label_pred, + mlvl_cls_score, mlvl_dir_scores, + score_thr, cfg.nms_post, cfg, + input_meta) + + return result + + def class_agnostic_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_max_scores, mlvl_label_pred, mlvl_cls_score, + mlvl_dir_scores, score_thr, max_num, cfg, + input_meta): + """Class agnostic nms for single batch. + + Args: + mlvl_bboxes (torch.Tensor): Bboxes from Multi-level. + mlvl_bboxes_for_nms (torch.Tensor): Bboxes for nms + (bev or minmax boxes) from Multi-level. + mlvl_max_scores (torch.Tensor): Max scores of Multi-level bbox. 
+ mlvl_label_pred (torch.Tensor): Class predictions + of Multi-level bbox. + mlvl_cls_score (torch.Tensor): Class scores of + Multi-level bbox. + mlvl_dir_scores (torch.Tensor): Direction scores of + Multi-level bbox. + score_thr (int): Score threshold. + max_num (int): Max number of bboxes after nms. + cfg (:obj:`ConfigDict`): Training or testing config. + input_meta (dict): Contain pcd and img's meta info. + + Returns: + dict: Predictions of single batch. Contain the keys: + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. + - scores_3d (torch.Tensor): Score of each bbox. + - labels_3d (torch.Tensor): Label of each bbox. + - cls_preds (torch.Tensor): Class score of each bbox. + """ + bboxes = [] + scores = [] + labels = [] + dir_scores = [] + cls_scores = [] + score_thr_inds = mlvl_max_scores > score_thr + _scores = mlvl_max_scores[score_thr_inds] + _bboxes_for_nms = mlvl_bboxes_for_nms[score_thr_inds, :] + if cfg.use_rotate_nms: + nms_func = nms_bev + else: + nms_func = nms_normal_bev + selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr) + + _mlvl_bboxes = mlvl_bboxes[score_thr_inds, :] + _mlvl_dir_scores = mlvl_dir_scores[score_thr_inds] + _mlvl_label_pred = mlvl_label_pred[score_thr_inds] + _mlvl_cls_score = mlvl_cls_score[score_thr_inds] + + if len(selected) > 0: + bboxes.append(_mlvl_bboxes[selected]) + scores.append(_scores[selected]) + labels.append(_mlvl_label_pred[selected]) + cls_scores.append(_mlvl_cls_score[selected]) + dir_scores.append(_mlvl_dir_scores[selected]) + dir_rot = limit_period(bboxes[-1][..., 6] - self.dir_offset, + self.dir_limit_offset, np.pi) + bboxes[-1][..., 6] = ( + dir_rot + self.dir_offset + + np.pi * dir_scores[-1].to(bboxes[-1].dtype)) + + if bboxes: + bboxes = torch.cat(bboxes, dim=0) + scores = torch.cat(scores, dim=0) + cls_scores = torch.cat(cls_scores, dim=0) + labels = torch.cat(labels, dim=0) + if bboxes.shape[0] > max_num: + _, inds = scores.sort(descending=True) + inds = inds[:max_num] + bboxes = bboxes[inds, :] + labels = labels[inds] + scores = scores[inds] + cls_scores = cls_scores[inds] + bboxes = input_meta['box_type_3d']( + bboxes, box_dim=self.box_code_size) + return dict( + boxes_3d=bboxes, + scores_3d=scores, + labels_3d=labels, + cls_preds=cls_scores # raw scores [max_num, cls_num] + ) + else: + return dict( + boxes_3d=input_meta['box_type_3d']( + mlvl_bboxes.new_zeros([0, self.box_code_size]), + box_dim=self.box_code_size), + scores_3d=mlvl_bboxes.new_zeros([0]), + labels_3d=mlvl_bboxes.new_zeros([0]), + cls_preds=mlvl_bboxes.new_zeros([0, mlvl_cls_score.shape[-1]])) diff --git a/mmdet3d/models/dense_heads/pgd_head.py b/mmdet3d/models/dense_heads/pgd_head.py index d9bfadb..1aac8e2 100644 --- a/mmdet3d/models/dense_heads/pgd_head.py +++ b/mmdet3d/models/dense_heads/pgd_head.py @@ -1,1229 +1,1229 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from mmcv.cnn import Scale, bias_init_with_prob, normal_init -from mmcv.runner import force_fp32 -from torch import nn as nn -from torch.nn import functional as F - -from mmdet3d.core import box3d_multiclass_nms, xywhr2xyxyr -from mmdet3d.core.bbox import points_cam2img, points_img2cam -from mmdet.core import distance2bbox, multi_apply -from ..builder import HEADS, build_loss -from .fcos_mono3d_head import FCOSMono3DHead - - -@HEADS.register_module() -class PGDHead(FCOSMono3DHead): - r"""Anchor-free head used in `PGD `_. - - Args: - use_depth_classifer (bool, optional): Whether to use depth classifier. - Defaults to True. 
- use_only_reg_proj (bool, optional): Whether to use only direct - regressed depth in the re-projection (to make the network easier - to learn). Defaults to False. - weight_dim (int, optional): Dimension of the location-aware weight - map. Defaults to -1. - weight_branch (tuple[tuple[int]], optional): Feature map channels of - the convolutional branch for weight map. Defaults to ((256, ), ). - depth_branch (tuple[int], optional): Feature map channels of the - branch for probabilistic depth estimation. Defaults to (64, ), - depth_range (tuple[float], optional): Range of depth estimation. - Defaults to (0, 70), - depth_unit (int, optional): Unit of depth range division. Defaults to - 10. - division (str, optional): Depth division method. Options include - 'uniform', 'linear', 'log', 'loguniform'. Defaults to 'uniform'. - depth_bins (int, optional): Discrete bins of depth division. Defaults - to 8. - loss_depth (dict, optional): Depth loss. Defaults to dict( - type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0). - loss_bbox2d (dict, optional): Loss for 2D box estimation. Defaults to - dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0). - loss_consistency (dict, optional): Consistency loss. Defaults to - dict(type='GIoULoss', loss_weight=1.0), - pred_velo (bool, optional): Whether to predict velocity. Defaults to - False. - pred_bbox2d (bool, optional): Whether to predict 2D bounding boxes. - Defaults to True. - pred_keypoints (bool, optional): Whether to predict keypoints. - Defaults to False, - bbox_coder (dict, optional): Bounding box coder. Defaults to - dict(type='PGDBBoxCoder', base_depths=((28.01, 16.32), ), - base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)), - code_size=7). - """ - - def __init__(self, - use_depth_classifier=True, - use_onlyreg_proj=False, - weight_dim=-1, - weight_branch=((256, ), ), - depth_branch=(64, ), - depth_range=(0, 70), - depth_unit=10, - division='uniform', - depth_bins=8, - loss_depth=dict( - type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), - loss_bbox2d=dict( - type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), - loss_consistency=dict(type='GIoULoss', loss_weight=1.0), - pred_bbox2d=True, - pred_keypoints=False, - bbox_coder=dict( - type='PGDBBoxCoder', - base_depths=((28.01, 16.32), ), - base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), - (3.9, 1.56, 1.6)), - code_size=7), - **kwargs): - self.use_depth_classifier = use_depth_classifier - self.use_onlyreg_proj = use_onlyreg_proj - self.depth_branch = depth_branch - self.pred_keypoints = pred_keypoints - self.weight_dim = weight_dim - self.weight_branch = weight_branch - self.weight_out_channels = [] - for weight_branch_channels in weight_branch: - if len(weight_branch_channels) > 0: - self.weight_out_channels.append(weight_branch_channels[-1]) - else: - self.weight_out_channels.append(-1) - self.depth_range = depth_range - self.depth_unit = depth_unit - self.division = division - if self.division == 'uniform': - self.num_depth_cls = int( - (depth_range[1] - depth_range[0]) / depth_unit) + 1 - if self.num_depth_cls != depth_bins: - print('Warning: The number of bins computed from ' + - 'depth_unit is different from given parameter! 
' + - 'Depth_unit will be considered with priority in ' + - 'Uniform Division.') - else: - self.num_depth_cls = depth_bins - super().__init__( - pred_bbox2d=pred_bbox2d, bbox_coder=bbox_coder, **kwargs) - self.loss_depth = build_loss(loss_depth) - if self.pred_bbox2d: - self.loss_bbox2d = build_loss(loss_bbox2d) - self.loss_consistency = build_loss(loss_consistency) - if self.pred_keypoints: - self.kpts_start = 9 if self.pred_velo else 7 - - def _init_layers(self): - """Initialize layers of the head.""" - super()._init_layers() - if self.pred_bbox2d: - self.scale_dim += 1 - if self.pred_keypoints: - self.scale_dim += 1 - self.scales = nn.ModuleList([ - nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) - for _ in self.strides - ]) - - def _init_predictor(self): - """Initialize predictor layers of the head.""" - super()._init_predictor() - - if self.use_depth_classifier: - self.conv_depth_cls_prev = self._init_branch( - conv_channels=self.depth_branch, - conv_strides=(1, ) * len(self.depth_branch)) - self.conv_depth_cls = nn.Conv2d(self.depth_branch[-1], - self.num_depth_cls, 1) - # Data-agnostic single param lambda for local depth fusion - self.fuse_lambda = nn.Parameter(torch.tensor(10e-5)) - - if self.weight_dim != -1: - self.conv_weight_prevs = nn.ModuleList() - self.conv_weights = nn.ModuleList() - for i in range(self.weight_dim): - weight_branch_channels = self.weight_branch[i] - weight_out_channel = self.weight_out_channels[i] - if len(weight_branch_channels) > 0: - self.conv_weight_prevs.append( - self._init_branch( - conv_channels=weight_branch_channels, - conv_strides=(1, ) * len(weight_branch_channels))) - self.conv_weights.append( - nn.Conv2d(weight_out_channel, 1, 1)) - else: - self.conv_weight_prevs.append(None) - self.conv_weights.append( - nn.Conv2d(self.feat_channels, 1, 1)) - - def init_weights(self): - """Initialize weights of the head. - - We currently still use the customized defined init_weights because the - default init of DCN triggered by the init_cfg will init - conv_offset.weight, which mistakenly affects the training stability. - """ - super().init_weights() - - bias_cls = bias_init_with_prob(0.01) - if self.use_depth_classifier: - for m in self.conv_depth_cls_prev: - if isinstance(m.conv, nn.Conv2d): - normal_init(m.conv, std=0.01) - normal_init(self.conv_depth_cls, std=0.01, bias=bias_cls) - - if self.weight_dim != -1: - for conv_weight_prev in self.conv_weight_prevs: - if conv_weight_prev is None: - continue - for m in conv_weight_prev: - if isinstance(m.conv, nn.Conv2d): - normal_init(m.conv, std=0.01) - for conv_weight in self.conv_weights: - normal_init(conv_weight, std=0.01) - - def forward(self, feats): - """Forward features from the upstream network. - - Args: - feats (tuple[Tensor]): Features from the upstream network, each is - a 4D-tensor. - - Returns: - tuple: - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_classes. - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2). - weight (list[Tensor]): Location-aware weight maps on each - scale level, each is a 4D-tensor, the channel number is - num_points * 1. 
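With the 'uniform' division configured above, the number of depth bins is int((depth_range[1] - depth_range[0]) / depth_unit) + 1, i.e. 8 bins for the (0, 70) / 10 defaults. The continuous depth is recovered later by the bbox coder's decode_prob_depth, which is not part of this hunk; the sketch below assumes a softmax expectation over the uniform bin centers, as an illustration only:

import torch
import torch.nn.functional as F

def uniform_depth_bins(depth_range=(0, 70), depth_unit=10):
    d_min, d_max = depth_range
    num_bins = int((d_max - d_min) / depth_unit) + 1
    return torch.arange(num_bins, dtype=torch.float32) * depth_unit + d_min

def expected_depth(depth_cls_logits, bin_centers):
    # softmax expectation over bin centers -> one continuous depth per row
    prob = F.softmax(depth_cls_logits, dim=-1)
    return (prob * bin_centers).sum(dim=-1)

bins = uniform_depth_bins()               # tensor([ 0., 10., ..., 70.])
logits = torch.randn(5, bins.numel())
print(expected_depth(logits, bins))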
- depth_cls_preds (list[Tensor]): Box scores for depth class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * self.num_depth_cls. - attr_preds (list[Tensor]): Attribute scores for each scale - level, each is a 4D-tensor, the channel number is - num_points * num_attrs. - centernesses (list[Tensor]): Centerness for each scale level, - each is a 4D-tensor, the channel number is num_points * 1. - """ - return multi_apply(self.forward_single, feats, self.scales, - self.strides) - - def forward_single(self, x, scale, stride): - """Forward features of a single scale level. - - Args: - x (Tensor): FPN feature maps of the specified stride. - scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize - the bbox prediction. - stride (int): The corresponding stride for feature maps, only - used to normalize the bbox prediction when self.norm_on_bbox - is True. - - Returns: - tuple: scores for each class, bbox and direction class - predictions, depth class predictions, location-aware weights, - attribute and centerness predictions of input feature maps. - """ - cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, cls_feat, \ - reg_feat = super().forward_single(x, scale, stride) - - max_regress_range = stride * self.regress_ranges[0][1] / \ - self.strides[0] - bbox_pred = self.bbox_coder.decode_2d(bbox_pred, scale, stride, - max_regress_range, self.training, - self.pred_keypoints, - self.pred_bbox2d) - - depth_cls_pred = None - if self.use_depth_classifier: - clone_reg_feat = reg_feat.clone() - for conv_depth_cls_prev_layer in self.conv_depth_cls_prev: - clone_reg_feat = conv_depth_cls_prev_layer(clone_reg_feat) - depth_cls_pred = self.conv_depth_cls(clone_reg_feat) - - weight = None - if self.weight_dim != -1: - weight = [] - for i in range(self.weight_dim): - clone_reg_feat = reg_feat.clone() - if len(self.weight_branch[i]) > 0: - for conv_weight_prev_layer in self.conv_weight_prevs[i]: - clone_reg_feat = conv_weight_prev_layer(clone_reg_feat) - weight.append(self.conv_weights[i](clone_reg_feat)) - weight = torch.cat(weight, dim=1) - - return cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \ - attr_pred, centerness - - def get_proj_bbox2d(self, - bbox_preds, - pos_dir_cls_preds, - labels_3d, - bbox_targets_3d, - pos_points, - pos_inds, - img_metas, - pos_depth_cls_preds=None, - pos_weights=None, - pos_cls_scores=None, - with_kpts=False): - """Decode box predictions and get projected 2D attributes. - - Args: - bbox_preds (list[Tensor]): Box predictions for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - pos_dir_cls_preds (Tensor): Box scores for direction class - predictions of positive boxes on all the scale levels in shape - (num_pos_points, 2). - labels_3d (list[Tensor]): 3D box category labels for each scale - level, each is a 4D-tensor. - bbox_targets_3d (list[Tensor]): 3D box targets for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - pos_points (Tensor): Foreground points. - pos_inds (Tensor): Index of foreground points from flattened - tensors. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - pos_depth_cls_preds (Tensor, optional): Probabilistic depth map of - positive boxes on all the scale levels in shape - (num_pos_points, self.num_depth_cls). Defaults to None. - pos_weights (Tensor, optional): Location-aware weights of positive - boxes in shape (num_pos_points, self.weight_dim). 
Defaults to - None. - pos_cls_scores (Tensor, optional): Classification scores of - positive boxes in shape (num_pos_points, self.num_classes). - Defaults to None. - with_kpts (bool, optional): Whether to output keypoints targets. - Defaults to False. - - Returns: - tuple[Tensor]: Exterior 2D boxes from projected 3D boxes, - predicted 2D boxes and keypoint targets (if necessary). - """ - views = [np.array(img_meta['cam2img']) for img_meta in img_metas] - num_imgs = len(img_metas) - img_idx = [] - for label in labels_3d: - for idx in range(num_imgs): - img_idx.append( - labels_3d[0].new_ones(int(len(label) / num_imgs)) * idx) - img_idx = torch.cat(img_idx) - pos_img_idx = img_idx[pos_inds] - - flatten_strided_bbox_preds = [] - flatten_strided_bbox2d_preds = [] - flatten_bbox_targets_3d = [] - flatten_strides = [] - - for stride_idx, bbox_pred in enumerate(bbox_preds): - flatten_bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape( - -1, sum(self.group_reg_dims)) - flatten_bbox_pred[:, :2] *= self.strides[stride_idx] - flatten_bbox_pred[:, -4:] *= self.strides[stride_idx] - flatten_strided_bbox_preds.append( - flatten_bbox_pred[:, :self.bbox_coder.bbox_code_size]) - flatten_strided_bbox2d_preds.append(flatten_bbox_pred[:, -4:]) - - bbox_target_3d = bbox_targets_3d[stride_idx].clone() - bbox_target_3d[:, :2] *= self.strides[stride_idx] - bbox_target_3d[:, -4:] *= self.strides[stride_idx] - flatten_bbox_targets_3d.append(bbox_target_3d) - - flatten_stride = flatten_bbox_pred.new_ones( - *flatten_bbox_pred.shape[:-1], 1) * self.strides[stride_idx] - flatten_strides.append(flatten_stride) - - flatten_strided_bbox_preds = torch.cat(flatten_strided_bbox_preds) - flatten_strided_bbox2d_preds = torch.cat(flatten_strided_bbox2d_preds) - flatten_bbox_targets_3d = torch.cat(flatten_bbox_targets_3d) - flatten_strides = torch.cat(flatten_strides) - pos_strided_bbox_preds = flatten_strided_bbox_preds[pos_inds] - pos_strided_bbox2d_preds = flatten_strided_bbox2d_preds[pos_inds] - pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] - pos_strides = flatten_strides[pos_inds] - - pos_decoded_bbox2d_preds = distance2bbox(pos_points, - pos_strided_bbox2d_preds) - - pos_strided_bbox_preds[:, :2] = \ - pos_points - pos_strided_bbox_preds[:, :2] - pos_bbox_targets_3d[:, :2] = \ - pos_points - pos_bbox_targets_3d[:, :2] - - if self.use_depth_classifier and (not self.use_onlyreg_proj): - pos_prob_depth_preds = self.bbox_coder.decode_prob_depth( - pos_depth_cls_preds, self.depth_range, self.depth_unit, - self.division, self.num_depth_cls) - sig_alpha = torch.sigmoid(self.fuse_lambda) - pos_strided_bbox_preds[:, 2] = \ - sig_alpha * pos_strided_bbox_preds.clone()[:, 2] + \ - (1 - sig_alpha) * pos_prob_depth_preds - - box_corners_in_image = pos_strided_bbox_preds.new_zeros( - (*pos_strided_bbox_preds.shape[:-1], 8, 2)) - box_corners_in_image_gt = pos_strided_bbox_preds.new_zeros( - (*pos_strided_bbox_preds.shape[:-1], 8, 2)) - - for idx in range(num_imgs): - mask = (pos_img_idx == idx) - if pos_strided_bbox_preds[mask].shape[0] == 0: - continue - cam2img = torch.eye( - 4, - dtype=pos_strided_bbox_preds.dtype, - device=pos_strided_bbox_preds.device) - view_shape = views[idx].shape - cam2img[:view_shape[0], :view_shape[1]] = \ - pos_strided_bbox_preds.new_tensor(views[idx]) - - centers2d_preds = pos_strided_bbox_preds.clone()[mask, :2] - centers2d_targets = pos_bbox_targets_3d.clone()[mask, :2] - centers3d_targets = points_img2cam(pos_bbox_targets_3d[mask, :3], - views[idx]) - - # use predicted depth to re-project the 
2.5D centers - pos_strided_bbox_preds[mask, :3] = points_img2cam( - pos_strided_bbox_preds[mask, :3], views[idx]) - pos_bbox_targets_3d[mask, :3] = centers3d_targets - - # depth fixed when computing re-project 3D bboxes - pos_strided_bbox_preds[mask, 2] = \ - pos_bbox_targets_3d.clone()[mask, 2] - - # decode yaws - if self.use_direction_classifier: - pos_dir_cls_scores = torch.max( - pos_dir_cls_preds[mask], dim=-1)[1] - pos_strided_bbox_preds[mask] = self.bbox_coder.decode_yaw( - pos_strided_bbox_preds[mask], centers2d_preds, - pos_dir_cls_scores, self.dir_offset, cam2img) - pos_bbox_targets_3d[mask, 6] = torch.atan2( - centers2d_targets[:, 0] - cam2img[0, 2], - cam2img[0, 0]) + pos_bbox_targets_3d[mask, 6] - - corners = img_metas[0]['box_type_3d']( - pos_strided_bbox_preds[mask], - box_dim=self.bbox_coder.bbox_code_size, - origin=(0.5, 0.5, 0.5)).corners - box_corners_in_image[mask] = points_cam2img(corners, cam2img) - - corners_gt = img_metas[0]['box_type_3d']( - pos_bbox_targets_3d[mask, :self.bbox_code_size], - box_dim=self.bbox_coder.bbox_code_size, - origin=(0.5, 0.5, 0.5)).corners - box_corners_in_image_gt[mask] = points_cam2img(corners_gt, cam2img) - - minxy = torch.min(box_corners_in_image, dim=1)[0] - maxxy = torch.max(box_corners_in_image, dim=1)[0] - proj_bbox2d_preds = torch.cat([minxy, maxxy], dim=1) - - outputs = (proj_bbox2d_preds, pos_decoded_bbox2d_preds) - - if with_kpts: - norm_strides = pos_strides * self.regress_ranges[0][1] / \ - self.strides[0] - kpts_targets = box_corners_in_image_gt - pos_points[..., None, :] - kpts_targets = kpts_targets.view( - (*pos_strided_bbox_preds.shape[:-1], 16)) - kpts_targets /= norm_strides - - outputs += (kpts_targets, ) - - return outputs - - def get_pos_predictions(self, bbox_preds, dir_cls_preds, depth_cls_preds, - weights, attr_preds, centernesses, pos_inds, - img_metas): - """Flatten predictions and get positive ones. - - Args: - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2) - depth_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * self.num_depth_cls. - attr_preds (list[Tensor]): Attribute scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_attrs. - centernesses (list[Tensor]): Centerness for each scale level, each - is a 4D-tensor, the channel number is num_points * 1. - pos_inds (Tensor): Index of foreground points from flattened - tensors. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Returns: - tuple[Tensor]: Box predictions, direction classes, probabilistic - depth maps, location-aware weight maps, attributes and - centerness predictions. 
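get_proj_bbox2d above blends the directly regressed depth with the probabilistic (classification-based) depth through a single learnable scalar, self.fuse_lambda, squashed by a sigmoid; with the 1e-4 initialization the blend starts out almost even. A minimal sketch of the fusion:

import torch

fuse_lambda = torch.nn.Parameter(torch.tensor(1e-4))  # sigmoid(1e-4) ~ 0.5

def fuse_depth(direct_depth, prob_depth, fuse_lambda):
    sig_alpha = torch.sigmoid(fuse_lambda)
    return sig_alpha * direct_depth + (1 - sig_alpha) * prob_depth

direct = torch.tensor([22.0, 41.5])
prob = torch.tensor([25.0, 38.0])
print(fuse_depth(direct, prob, fuse_lambda))  # roughly the midpoints at init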
- """ - flatten_bbox_preds = [ - bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims)) - for bbox_pred in bbox_preds - ] - flatten_dir_cls_preds = [ - dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2) - for dir_cls_pred in dir_cls_preds - ] - flatten_centerness = [ - centerness.permute(0, 2, 3, 1).reshape(-1) - for centerness in centernesses - ] - flatten_bbox_preds = torch.cat(flatten_bbox_preds) - flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds) - flatten_centerness = torch.cat(flatten_centerness) - pos_bbox_preds = flatten_bbox_preds[pos_inds] - pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds] - pos_centerness = flatten_centerness[pos_inds] - - pos_depth_cls_preds = None - if self.use_depth_classifier: - flatten_depth_cls_preds = [ - depth_cls_pred.permute(0, 2, 3, - 1).reshape(-1, self.num_depth_cls) - for depth_cls_pred in depth_cls_preds - ] - flatten_depth_cls_preds = torch.cat(flatten_depth_cls_preds) - pos_depth_cls_preds = flatten_depth_cls_preds[pos_inds] - - pos_weights = None - if self.weight_dim != -1: - flatten_weights = [ - weight.permute(0, 2, 3, 1).reshape(-1, self.weight_dim) - for weight in weights - ] - flatten_weights = torch.cat(flatten_weights) - pos_weights = flatten_weights[pos_inds] - - pos_attr_preds = None - if self.pred_attrs: - flatten_attr_preds = [ - attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs) - for attr_pred in attr_preds - ] - flatten_attr_preds = torch.cat(flatten_attr_preds) - pos_attr_preds = flatten_attr_preds[pos_inds] - - return pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, \ - pos_weights, pos_attr_preds, pos_centerness - - @force_fp32( - apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', - 'depth_cls_preds', 'weights', 'attr_preds', 'centernesses')) - def loss(self, - cls_scores, - bbox_preds, - dir_cls_preds, - depth_cls_preds, - weights, - attr_preds, - centernesses, - gt_bboxes, - gt_labels, - gt_bboxes_3d, - gt_labels_3d, - centers2d, - depths, - attr_labels, - img_metas, - gt_bboxes_ignore=None): - """Compute loss of the head. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_classes. - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2) - depth_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * self.num_depth_cls. - weights (list[Tensor]): Location-aware weights for each scale - level, each is a 4D-tensor, the channel number is - num_points * self.weight_dim. - attr_preds (list[Tensor]): Attribute scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_attrs. - centernesses (list[Tensor]): Centerness for each scale level, each - is a 4D-tensor, the channel number is num_points * 1. - gt_bboxes (list[Tensor]): Ground truth bboxes for each image with - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): class indices corresponding to each box - gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of - (num_gts, code_size). - gt_labels_3d (list[Tensor]): same as gt_labels - centers2d (list[Tensor]): 2D centers on the image with shape of - (num_gts, 2). 
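get_pos_predictions above repeatedly applies the same flatten-and-gather step: each per-level (B, C, H, W) map is permuted to channels-last, reshaped to rows, concatenated across levels, and indexed with pos_inds. A compact sketch of that step (flatten_and_gather is an illustrative name):

import torch

def flatten_and_gather(level_preds, channels, pos_inds):
    flat = torch.cat([
        p.permute(0, 2, 3, 1).reshape(-1, channels) for p in level_preds])
    return flat[pos_inds]

level_preds = [torch.randn(2, 7, h, w) for h, w in [(8, 8), (4, 4)]]
pos_inds = torch.tensor([0, 5, 130])
print(flatten_and_gather(level_preds, 7, pos_inds).shape)  # torch.Size([3, 7])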
- depths (list[Tensor]): Depth ground truth with shape of - (num_gts, ). - attr_labels (list[Tensor]): Attributes indices of each box. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can - be ignored when computing the loss. Defaults to None. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ - len(depth_cls_preds) == len(weights) == len(centernesses) == \ - len(attr_preds), 'The length of cls_scores, bbox_preds, ' \ - 'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \ - f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \ - f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \ - f'{len(centernesses)}, {len(attr_preds)} are inconsistent.' - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, - bbox_preds[0].device) - labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \ - self.get_targets( - all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, depths, attr_labels) - - num_imgs = cls_scores[0].size(0) - # flatten cls_scores and targets - flatten_cls_scores = [ - cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) - for cls_score in cls_scores - ] - flatten_cls_scores = torch.cat(flatten_cls_scores) - flatten_labels_3d = torch.cat(labels_3d) - flatten_bbox_targets_3d = torch.cat(bbox_targets_3d) - flatten_centerness_targets = torch.cat(centerness_targets) - flatten_points = torch.cat( - [points.repeat(num_imgs, 1) for points in all_level_points]) - if self.pred_attrs: - flatten_attr_targets = torch.cat(attr_targets) - - # FG cat_id: [0, num_classes -1], BG cat_id: num_classes - bg_class_ind = self.num_classes - pos_inds = ((flatten_labels_3d >= 0) - & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1) - num_pos = len(pos_inds) - - loss_dict = dict() - - loss_dict['loss_cls'] = self.loss_cls( - flatten_cls_scores, - flatten_labels_3d, - avg_factor=num_pos + num_imgs) # avoid num_pos is 0 - - pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, pos_weights, \ - pos_attr_preds, pos_centerness = self.get_pos_predictions( - bbox_preds, dir_cls_preds, depth_cls_preds, weights, - attr_preds, centernesses, pos_inds, img_metas) - - if num_pos > 0: - pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] - pos_centerness_targets = flatten_centerness_targets[pos_inds] - pos_points = flatten_points[pos_inds] - if self.pred_attrs: - pos_attr_targets = flatten_attr_targets[pos_inds] - if self.use_direction_classifier: - pos_dir_cls_targets = self.get_direction_target( - pos_bbox_targets_3d, self.dir_offset, one_hot=False) - - bbox_weights = pos_centerness_targets.new_ones( - len(pos_centerness_targets), sum(self.group_reg_dims)) - equal_weights = pos_centerness_targets.new_ones( - pos_centerness_targets.shape) - code_weight = self.train_cfg.get('code_weight', None) - if code_weight: - assert len(code_weight) == sum(self.group_reg_dims) - bbox_weights = bbox_weights * bbox_weights.new_tensor( - code_weight) - - if self.diff_rad_by_sin: - pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( - pos_bbox_preds, pos_bbox_targets_3d) - - loss_dict['loss_offset'] = self.loss_bbox( - pos_bbox_preds[:, :2], - pos_bbox_targets_3d[:, :2], - weight=bbox_weights[:, :2], - avg_factor=equal_weights.sum()) - loss_dict['loss_size'] = 
self.loss_bbox( - pos_bbox_preds[:, 3:6], - pos_bbox_targets_3d[:, 3:6], - weight=bbox_weights[:, 3:6], - avg_factor=equal_weights.sum()) - loss_dict['loss_rotsin'] = self.loss_bbox( - pos_bbox_preds[:, 6], - pos_bbox_targets_3d[:, 6], - weight=bbox_weights[:, 6], - avg_factor=equal_weights.sum()) - if self.pred_velo: - loss_dict['loss_velo'] = self.loss_bbox( - pos_bbox_preds[:, 7:9], - pos_bbox_targets_3d[:, 7:9], - weight=bbox_weights[:, 7:9], - avg_factor=equal_weights.sum()) - - proj_bbox2d_inputs = (bbox_preds, pos_dir_cls_preds, labels_3d, - bbox_targets_3d, pos_points, pos_inds, - img_metas) - - # direction classification loss - # TODO: add more check for use_direction_classifier - if self.use_direction_classifier: - loss_dict['loss_dir'] = self.loss_dir( - pos_dir_cls_preds, - pos_dir_cls_targets, - equal_weights, - avg_factor=equal_weights.sum()) - - # init depth loss with the one computed from direct regression - loss_dict['loss_depth'] = self.loss_bbox( - pos_bbox_preds[:, 2], - pos_bbox_targets_3d[:, 2], - weight=bbox_weights[:, 2], - avg_factor=equal_weights.sum()) - # depth classification loss - if self.use_depth_classifier: - pos_prob_depth_preds = self.bbox_coder.decode_prob_depth( - pos_depth_cls_preds, self.depth_range, self.depth_unit, - self.division, self.num_depth_cls) - sig_alpha = torch.sigmoid(self.fuse_lambda) - if self.weight_dim != -1: - loss_fuse_depth = self.loss_depth( - sig_alpha * pos_bbox_preds[:, 2] + - (1 - sig_alpha) * pos_prob_depth_preds, - pos_bbox_targets_3d[:, 2], - sigma=pos_weights[:, 0], - weight=bbox_weights[:, 2], - avg_factor=equal_weights.sum()) - else: - loss_fuse_depth = self.loss_depth( - sig_alpha * pos_bbox_preds[:, 2] + - (1 - sig_alpha) * pos_prob_depth_preds, - pos_bbox_targets_3d[:, 2], - weight=bbox_weights[:, 2], - avg_factor=equal_weights.sum()) - loss_dict['loss_depth'] = loss_fuse_depth - - proj_bbox2d_inputs += (pos_depth_cls_preds, ) - - if self.pred_keypoints: - # use smoothL1 to compute consistency loss for keypoints - # normalize the offsets with strides - proj_bbox2d_preds, pos_decoded_bbox2d_preds, kpts_targets = \ - self.get_proj_bbox2d(*proj_bbox2d_inputs, with_kpts=True) - loss_dict['loss_kpts'] = self.loss_bbox( - pos_bbox_preds[:, self.kpts_start:self.kpts_start + 16], - kpts_targets, - weight=bbox_weights[:, - self.kpts_start:self.kpts_start + 16], - avg_factor=equal_weights.sum()) - - if self.pred_bbox2d: - loss_dict['loss_bbox2d'] = self.loss_bbox2d( - pos_bbox_preds[:, -4:], - pos_bbox_targets_3d[:, -4:], - weight=bbox_weights[:, -4:], - avg_factor=equal_weights.sum()) - if not self.pred_keypoints: - proj_bbox2d_preds, pos_decoded_bbox2d_preds = \ - self.get_proj_bbox2d(*proj_bbox2d_inputs) - loss_dict['loss_consistency'] = self.loss_consistency( - proj_bbox2d_preds, - pos_decoded_bbox2d_preds, - weight=bbox_weights[:, -4:], - avg_factor=equal_weights.sum()) - - loss_dict['loss_centerness'] = self.loss_centerness( - pos_centerness, pos_centerness_targets) - - # attribute classification loss - if self.pred_attrs: - loss_dict['loss_attr'] = self.loss_attr( - pos_attr_preds, - pos_attr_targets, - pos_centerness_targets, - avg_factor=pos_centerness_targets.sum()) - - else: - # need absolute due to possible negative delta x/y - loss_dict['loss_offset'] = pos_bbox_preds[:, :2].sum() - loss_dict['loss_size'] = pos_bbox_preds[:, 3:6].sum() - loss_dict['loss_rotsin'] = pos_bbox_preds[:, 6].sum() - loss_dict['loss_depth'] = pos_bbox_preds[:, 2].sum() - if self.pred_velo: - loss_dict['loss_velo'] = pos_bbox_preds[:, 
7:9].sum() - if self.pred_keypoints: - loss_dict['loss_kpts'] = pos_bbox_preds[:, - self.kpts_start:self. - kpts_start + 16].sum() - if self.pred_bbox2d: - loss_dict['loss_bbox2d'] = pos_bbox_preds[:, -4:].sum() - loss_dict['loss_consistency'] = pos_bbox_preds[:, -4:].sum() - loss_dict['loss_centerness'] = pos_centerness.sum() - if self.use_direction_classifier: - loss_dict['loss_dir'] = pos_dir_cls_preds.sum() - if self.use_depth_classifier: - sig_alpha = torch.sigmoid(self.fuse_lambda) - loss_fuse_depth = \ - sig_alpha * pos_bbox_preds[:, 2].sum() + \ - (1 - sig_alpha) * pos_depth_cls_preds.sum() - if self.weight_dim != -1: - loss_fuse_depth *= torch.exp(-pos_weights[:, 0].sum()) - loss_dict['loss_depth'] = loss_fuse_depth - if self.pred_attrs: - loss_dict['loss_attr'] = pos_attr_preds.sum() - - return loss_dict - - @force_fp32( - apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', - 'depth_cls_preds', 'weights', 'attr_preds', 'centernesses')) - def get_bboxes(self, - cls_scores, - bbox_preds, - dir_cls_preds, - depth_cls_preds, - weights, - attr_preds, - centernesses, - img_metas, - cfg=None, - rescale=None): - """Transform network output for a batch into bbox predictions. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level - Has shape (N, num_points * num_classes, H, W) - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level with shape (N, num_points * 4, H, W) - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * 2. (bin = 2) - depth_cls_preds (list[Tensor]): Box scores for direction class - predictions on each scale level, each is a 4D-tensor, - the channel number is num_points * self.num_depth_cls. - weights (list[Tensor]): Location-aware weights for each scale - level, each is a 4D-tensor, the channel number is - num_points * self.weight_dim. - attr_preds (list[Tensor]): Attribute scores for each scale level - Has shape (N, num_points * num_attrs, H, W) - centernesses (list[Tensor]): Centerness for each scale level with - shape (N, num_points * 1, H, W) - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - cfg (mmcv.Config, optional): Test / postprocessing configuration, - if None, test_cfg would be used. Defaults to None. - rescale (bool, optional): If True, return boxes in original image - space. Defaults to None. - - Returns: - list[tuple[Tensor]]: Each item in result_list is a tuple, which - consists of predicted 3D boxes, scores, labels, attributes and - 2D boxes (if necessary). - """ - assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ - len(depth_cls_preds) == len(weights) == len(centernesses) == \ - len(attr_preds), 'The length of cls_scores, bbox_preds, ' \ - 'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \ - f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \ - f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \ - f'{len(centernesses)}, {len(attr_preds)} are inconsistent.' 
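In the num_pos == 0 branch of loss() above, every entry is just the sum of the (empty) positive predictions; presumably this keeps each prediction branch connected to the autograd graph so all parameters still receive a (zero) gradient, which matters e.g. for distributed training. A toy illustration with hypothetical tensors:

import torch

preds = torch.randn(100, 7, requires_grad=True)
pos_inds = torch.empty(0, dtype=torch.long)   # no positive samples this batch
loss = preds[pos_inds].sum()                  # zero-valued but has a grad_fn
loss.backward()
print(loss.item(), preds.grad.abs().sum().item())  # 0.0 0.0, gradient defined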
- num_levels = len(cls_scores) - - featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] - mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, - bbox_preds[0].device) - result_list = [] - for img_id in range(len(img_metas)): - cls_score_list = [ - cls_scores[i][img_id].detach() for i in range(num_levels) - ] - bbox_pred_list = [ - bbox_preds[i][img_id].detach() for i in range(num_levels) - ] - if self.use_direction_classifier: - dir_cls_pred_list = [ - dir_cls_preds[i][img_id].detach() - for i in range(num_levels) - ] - else: - dir_cls_pred_list = [ - cls_scores[i][img_id].new_full( - [2, *cls_scores[i][img_id].shape[1:]], 0).detach() - for i in range(num_levels) - ] - if self.use_depth_classifier: - depth_cls_pred_list = [ - depth_cls_preds[i][img_id].detach() - for i in range(num_levels) - ] - else: - depth_cls_pred_list = [ - cls_scores[i][img_id].new_full( - [self.num_depth_cls, *cls_scores[i][img_id].shape[1:]], - 0).detach() for i in range(num_levels) - ] - if self.weight_dim != -1: - weight_list = [ - weights[i][img_id].detach() for i in range(num_levels) - ] - else: - weight_list = [ - cls_scores[i][img_id].new_full( - [1, *cls_scores[i][img_id].shape[1:]], 0).detach() - for i in range(num_levels) - ] - if self.pred_attrs: - attr_pred_list = [ - attr_preds[i][img_id].detach() for i in range(num_levels) - ] - else: - attr_pred_list = [ - cls_scores[i][img_id].new_full( - [self.num_attrs, *cls_scores[i][img_id].shape[1:]], - self.attr_background_label).detach() - for i in range(num_levels) - ] - centerness_pred_list = [ - centernesses[i][img_id].detach() for i in range(num_levels) - ] - input_meta = img_metas[img_id] - det_bboxes = self._get_bboxes_single( - cls_score_list, bbox_pred_list, dir_cls_pred_list, - depth_cls_pred_list, weight_list, attr_pred_list, - centerness_pred_list, mlvl_points, input_meta, cfg, rescale) - result_list.append(det_bboxes) - return result_list - - def _get_bboxes_single(self, - cls_scores, - bbox_preds, - dir_cls_preds, - depth_cls_preds, - weights, - attr_preds, - centernesses, - mlvl_points, - input_meta, - cfg, - rescale=False): - """Transform outputs for a single batch item into bbox predictions. - - Args: - cls_scores (list[Tensor]): Box scores for a single scale level - Has shape (num_points * num_classes, H, W). - bbox_preds (list[Tensor]): Box energies / deltas for a single scale - level with shape (num_points * bbox_code_size, H, W). - dir_cls_preds (list[Tensor]): Box scores for direction class - predictions on a single scale level with shape - (num_points * 2, H, W) - depth_cls_preds (list[Tensor]): Box scores for probabilistic depth - predictions on a single scale level with shape - (num_points * self.num_depth_cls, H, W) - weights (list[Tensor]): Location-aware weight maps on a single - scale level with shape (num_points * self.weight_dim, H, W). - attr_preds (list[Tensor]): Attribute scores for each scale level - Has shape (N, num_points * num_attrs, H, W) - centernesses (list[Tensor]): Centerness for a single scale level - with shape (num_points, H, W). - mlvl_points (list[Tensor]): Box reference for a single scale level - with shape (num_total_points, 2). - input_meta (dict): Metadata of input image. - cfg (mmcv.Config): Test / postprocessing configuration, - if None, test_cfg would be used. - rescale (bool, optional): If True, return boxes in original image - space. Defaults to False. - - Returns: - tuples[Tensor]: Predicted 3D boxes, scores, labels, attributes and - 2D boxes (if necessary). 
- """ - view = np.array(input_meta['cam2img']) - scale_factor = input_meta['scale_factor'] - cfg = self.test_cfg if cfg is None else cfg - assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) - mlvl_centers2d = [] - mlvl_bboxes = [] - mlvl_scores = [] - mlvl_dir_scores = [] - mlvl_attr_scores = [] - mlvl_centerness = [] - mlvl_depth_cls_scores = [] - mlvl_depth_uncertainty = [] - mlvl_bboxes2d = None - if self.pred_bbox2d: - mlvl_bboxes2d = [] - - for cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \ - attr_pred, centerness, points in zip( - cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds, - weights, attr_preds, centernesses, mlvl_points): - assert cls_score.size()[-2:] == bbox_pred.size()[-2:] - scores = cls_score.permute(1, 2, 0).reshape( - -1, self.cls_out_channels).sigmoid() - dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) - dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] - depth_cls_pred = depth_cls_pred.permute(1, 2, 0).reshape( - -1, self.num_depth_cls) - depth_cls_score = F.softmax( - depth_cls_pred, dim=-1).topk( - k=2, dim=-1)[0].mean(dim=-1) - if self.weight_dim != -1: - weight = weight.permute(1, 2, 0).reshape(-1, self.weight_dim) - else: - weight = weight.permute(1, 2, 0).reshape(-1, 1) - depth_uncertainty = torch.exp(-weight[:, -1]) - attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs) - attr_score = torch.max(attr_pred, dim=-1)[1] - centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() - - bbox_pred = bbox_pred.permute(1, 2, - 0).reshape(-1, - sum(self.group_reg_dims)) - bbox_pred3d = bbox_pred[:, :self.bbox_coder.bbox_code_size] - if self.pred_bbox2d: - bbox_pred2d = bbox_pred[:, -4:] - nms_pre = cfg.get('nms_pre', -1) - if nms_pre > 0 and scores.shape[0] > nms_pre: - merged_scores = scores * centerness[:, None] - if self.use_depth_classifier: - merged_scores *= depth_cls_score[:, None] - if self.weight_dim != -1: - merged_scores *= depth_uncertainty[:, None] - max_scores, _ = merged_scores.max(dim=1) - _, topk_inds = max_scores.topk(nms_pre) - points = points[topk_inds, :] - bbox_pred3d = bbox_pred3d[topk_inds, :] - scores = scores[topk_inds, :] - dir_cls_pred = dir_cls_pred[topk_inds, :] - depth_cls_pred = depth_cls_pred[topk_inds, :] - centerness = centerness[topk_inds] - dir_cls_score = dir_cls_score[topk_inds] - depth_cls_score = depth_cls_score[topk_inds] - depth_uncertainty = depth_uncertainty[topk_inds] - attr_score = attr_score[topk_inds] - if self.pred_bbox2d: - bbox_pred2d = bbox_pred2d[topk_inds, :] - # change the offset to actual center predictions - bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2] - if rescale: - bbox_pred3d[:, :2] /= bbox_pred3d[:, :2].new_tensor( - scale_factor) - if self.pred_bbox2d: - bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor) - if self.use_depth_classifier: - prob_depth_pred = self.bbox_coder.decode_prob_depth( - depth_cls_pred, self.depth_range, self.depth_unit, - self.division, self.num_depth_cls) - sig_alpha = torch.sigmoid(self.fuse_lambda) - bbox_pred3d[:, 2] = sig_alpha * bbox_pred3d[:, 2] + \ - (1 - sig_alpha) * prob_depth_pred - pred_center2d = bbox_pred3d[:, :3].clone() - bbox_pred3d[:, :3] = points_img2cam(bbox_pred3d[:, :3], view) - mlvl_centers2d.append(pred_center2d) - mlvl_bboxes.append(bbox_pred3d) - mlvl_scores.append(scores) - mlvl_dir_scores.append(dir_cls_score) - mlvl_depth_cls_scores.append(depth_cls_score) - mlvl_attr_scores.append(attr_score) - mlvl_centerness.append(centerness) - mlvl_depth_uncertainty.append(depth_uncertainty) - if 
self.pred_bbox2d: - bbox_pred2d = distance2bbox( - points, bbox_pred2d, max_shape=input_meta['img_shape']) - mlvl_bboxes2d.append(bbox_pred2d) - - mlvl_centers2d = torch.cat(mlvl_centers2d) - mlvl_bboxes = torch.cat(mlvl_bboxes) - mlvl_dir_scores = torch.cat(mlvl_dir_scores) - if self.pred_bbox2d: - mlvl_bboxes2d = torch.cat(mlvl_bboxes2d) - - # change local yaw to global yaw for 3D nms - cam2img = torch.eye( - 4, dtype=mlvl_centers2d.dtype, device=mlvl_centers2d.device) - cam2img[:view.shape[0], :view.shape[1]] = \ - mlvl_centers2d.new_tensor(view) - mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, - mlvl_dir_scores, - self.dir_offset, cam2img) - - mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( - mlvl_bboxes, - box_dim=self.bbox_coder.bbox_code_size, - origin=(0.5, 0.5, 0.5)).bev) - - mlvl_scores = torch.cat(mlvl_scores) - padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) - # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 - # BG cat_id: num_class - mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) - mlvl_attr_scores = torch.cat(mlvl_attr_scores) - mlvl_centerness = torch.cat(mlvl_centerness) - # no scale_factors in box3d_multiclass_nms - # Then we multiply it from outside - mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None] - if self.use_depth_classifier: # multiply the depth confidence - mlvl_depth_cls_scores = torch.cat(mlvl_depth_cls_scores) - mlvl_nms_scores *= mlvl_depth_cls_scores[:, None] - if self.weight_dim != -1: - mlvl_depth_uncertainty = torch.cat(mlvl_depth_uncertainty) - mlvl_nms_scores *= mlvl_depth_uncertainty[:, None] - results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, - mlvl_nms_scores, cfg.score_thr, - cfg.max_per_img, cfg, mlvl_dir_scores, - mlvl_attr_scores, mlvl_bboxes2d) - bboxes, scores, labels, dir_scores, attrs = results[0:5] - attrs = attrs.to(labels.dtype) # change data type to int - bboxes = input_meta['box_type_3d']( - bboxes, - box_dim=self.bbox_coder.bbox_code_size, - origin=(0.5, 0.5, 0.5)) - # Note that the predictions use origin (0.5, 0.5, 0.5) - # Due to the ground truth centers2d are the gravity center of objects - # v0.10.0 fix inplace operation to the input tensor of cam_box3d - # So here we also need to add origin=(0.5, 0.5, 0.5) - if not self.pred_attrs: - attrs = None - - outputs = (bboxes, scores, labels, attrs) - if self.pred_bbox2d: - bboxes2d = results[-1] - bboxes2d = torch.cat([bboxes2d, scores[:, None]], dim=1) - outputs = outputs + (bboxes2d, ) - - return outputs - - def get_targets(self, points, gt_bboxes_list, gt_labels_list, - gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, - depths_list, attr_labels_list): - """Compute regression, classification and centerss targets for points - in multiple images. - - Args: - points (list[Tensor]): Points of each fpn level, each has shape - (num_points, 2). - gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, - each has shape (num_gt, 4). - gt_labels_list (list[Tensor]): Ground truth labels of each box, - each has shape (num_gt,). - gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each - image, each has shape (num_gt, bbox_code_size). - gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each - box, each has shape (num_gt,). - centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, - each has shape (num_gt, 2). - depths_list (list[Tensor]): Depth of projected 3D centers onto 2D - image, each has shape (num_gt, 1). 
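Before box3d_multiclass_nms, _get_bboxes_single above fuses the sigmoid class scores multiplicatively with centerness, with the mean of the top-2 depth-class probabilities, and with exp(-weight) used as a depth-uncertainty confidence. A sketch of that fusion with hypothetical shapes:

import torch

def fuse_nms_scores(cls_scores, centerness, depth_cls_score=None,
                    depth_uncertainty=None):
    # cls_scores: (N, num_classes); the other terms are per-box (N, ) factors.
    scores = cls_scores * centerness[:, None]
    if depth_cls_score is not None:
        scores = scores * depth_cls_score[:, None]
    if depth_uncertainty is not None:
        scores = scores * depth_uncertainty[:, None]
    return scores

cls_scores = torch.rand(10, 3)
centerness = torch.rand(10)
depth_conf = torch.rand(10)
depth_unc = torch.exp(-torch.rand(10))        # exp(-w), as in the head above
print(fuse_nms_scores(cls_scores, centerness, depth_conf, depth_unc).shape)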
- attr_labels_list (list[Tensor]): Attribute labels of each box, - each has shape (num_gt,). - - Returns: - tuple: - concat_lvl_labels (list[Tensor]): Labels of each level. \ - concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ - level. - """ - assert len(points) == len(self.regress_ranges) - num_levels = len(points) - # expand regress ranges to align with points - expanded_regress_ranges = [ - points[i].new_tensor(self.regress_ranges[i])[None].expand_as( - points[i]) for i in range(num_levels) - ] - # concat all levels points and regress ranges - concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) - concat_points = torch.cat(points, dim=0) - - # the number of points per img, per lvl - num_points = [center.size(0) for center in points] - - if attr_labels_list is None: - attr_labels_list = [ - gt_labels.new_full(gt_labels.shape, self.attr_background_label) - for gt_labels in gt_labels_list - ] - - # get labels and bbox_targets of each image - _, bbox_targets_list, labels_3d_list, bbox_targets_3d_list, \ - centerness_targets_list, attr_targets_list = multi_apply( - self._get_target_single, - gt_bboxes_list, - gt_labels_list, - gt_bboxes_3d_list, - gt_labels_3d_list, - centers2d_list, - depths_list, - attr_labels_list, - points=concat_points, - regress_ranges=concat_regress_ranges, - num_points_per_lvl=num_points) - - # split to per img, per level - bbox_targets_list = [ - bbox_targets.split(num_points, 0) - for bbox_targets in bbox_targets_list - ] - labels_3d_list = [ - labels_3d.split(num_points, 0) for labels_3d in labels_3d_list - ] - bbox_targets_3d_list = [ - bbox_targets_3d.split(num_points, 0) - for bbox_targets_3d in bbox_targets_3d_list - ] - centerness_targets_list = [ - centerness_targets.split(num_points, 0) - for centerness_targets in centerness_targets_list - ] - attr_targets_list = [ - attr_targets.split(num_points, 0) - for attr_targets in attr_targets_list - ] - - # concat per level image - concat_lvl_labels_3d = [] - concat_lvl_bbox_targets_3d = [] - concat_lvl_centerness_targets = [] - concat_lvl_attr_targets = [] - for i in range(num_levels): - concat_lvl_labels_3d.append( - torch.cat([labels[i] for labels in labels_3d_list])) - concat_lvl_centerness_targets.append( - torch.cat([ - centerness_targets[i] - for centerness_targets in centerness_targets_list - ])) - bbox_targets_3d = torch.cat([ - bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list - ]) - if self.pred_bbox2d: - bbox_targets = torch.cat( - [bbox_targets[i] for bbox_targets in bbox_targets_list]) - bbox_targets_3d = torch.cat([bbox_targets_3d, bbox_targets], - dim=1) - concat_lvl_attr_targets.append( - torch.cat( - [attr_targets[i] for attr_targets in attr_targets_list])) - if self.norm_on_bbox: - bbox_targets_3d[:, :2] = \ - bbox_targets_3d[:, :2] / self.strides[i] - if self.pred_bbox2d: - bbox_targets_3d[:, -4:] = \ - bbox_targets_3d[:, -4:] / self.strides[i] - concat_lvl_bbox_targets_3d.append(bbox_targets_3d) - return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \ - concat_lvl_centerness_targets, concat_lvl_attr_targets +# Copyright (c) OpenMMLab. All rights reserved. 
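# --- Editorial sketch, not part of the patch ----------------------------------
# The removed `_get_bboxes_single` above fuses the directly regressed depth
# with a probabilistic estimate through a learned gate `sigmoid(fuse_lambda)`.
# The snippet below only illustrates that fusion; it assumes that
# `PGDBBoxCoder.decode_prob_depth` with 'uniform' division places bin centers
# every `depth_unit` starting at `depth_range[0]` and takes a softmax-weighted
# expectation. `fuse_depth_sketch` is a hypothetical helper, not an API of
# this repository, and `fuse_lambda` is a scalar tensor like `self.fuse_lambda`.
import torch
import torch.nn.functional as F

def fuse_depth_sketch(direct_depth, depth_cls_pred, fuse_lambda,
                      depth_range=(0, 70), depth_unit=10):
    """direct_depth: (N, ), depth_cls_pred: (N, num_bins) raw logits."""
    num_bins = int((depth_range[1] - depth_range[0]) / depth_unit) + 1
    # assumed bin centers for 'uniform' division: 0, 10, ..., 70
    bin_centers = torch.arange(num_bins) * depth_unit + depth_range[0]
    # probabilistic depth = expectation of the bin centers under the softmax
    prob_depth = (F.softmax(depth_cls_pred, dim=-1) *
                  bin_centers.to(depth_cls_pred)).sum(dim=-1)
    sig_alpha = torch.sigmoid(fuse_lambda)  # learned fusion gate
    return sig_alpha * direct_depth + (1 - sig_alpha) * prob_depth
# -------------------------------------------------------------------------------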
+import numpy as np +import torch +from mmcv.cnn import Scale, bias_init_with_prob, normal_init +from mmcv.runner import force_fp32 +from torch import nn as nn +from torch.nn import functional as F + +from mmdet3d.core import box3d_multiclass_nms, xywhr2xyxyr +from mmdet3d.core.bbox import points_cam2img, points_img2cam +from mmdet.core import distance2bbox, multi_apply +from ..builder import HEADS, build_loss +from .fcos_mono3d_head import FCOSMono3DHead + + +@HEADS.register_module() +class PGDHead(FCOSMono3DHead): + r"""Anchor-free head used in `PGD `_. + + Args: + use_depth_classifer (bool, optional): Whether to use depth classifier. + Defaults to True. + use_only_reg_proj (bool, optional): Whether to use only direct + regressed depth in the re-projection (to make the network easier + to learn). Defaults to False. + weight_dim (int, optional): Dimension of the location-aware weight + map. Defaults to -1. + weight_branch (tuple[tuple[int]], optional): Feature map channels of + the convolutional branch for weight map. Defaults to ((256, ), ). + depth_branch (tuple[int], optional): Feature map channels of the + branch for probabilistic depth estimation. Defaults to (64, ), + depth_range (tuple[float], optional): Range of depth estimation. + Defaults to (0, 70), + depth_unit (int, optional): Unit of depth range division. Defaults to + 10. + division (str, optional): Depth division method. Options include + 'uniform', 'linear', 'log', 'loguniform'. Defaults to 'uniform'. + depth_bins (int, optional): Discrete bins of depth division. Defaults + to 8. + loss_depth (dict, optional): Depth loss. Defaults to dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0). + loss_bbox2d (dict, optional): Loss for 2D box estimation. Defaults to + dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0). + loss_consistency (dict, optional): Consistency loss. Defaults to + dict(type='GIoULoss', loss_weight=1.0), + pred_velo (bool, optional): Whether to predict velocity. Defaults to + False. + pred_bbox2d (bool, optional): Whether to predict 2D bounding boxes. + Defaults to True. + pred_keypoints (bool, optional): Whether to predict keypoints. + Defaults to False, + bbox_coder (dict, optional): Bounding box coder. Defaults to + dict(type='PGDBBoxCoder', base_depths=((28.01, 16.32), ), + base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)), + code_size=7). 
+ """ + + def __init__(self, + use_depth_classifier=True, + use_onlyreg_proj=False, + weight_dim=-1, + weight_branch=((256, ), ), + depth_branch=(64, ), + depth_range=(0, 70), + depth_unit=10, + division='uniform', + depth_bins=8, + loss_depth=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_bbox2d=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_consistency=dict(type='GIoULoss', loss_weight=1.0), + pred_bbox2d=True, + pred_keypoints=False, + bbox_coder=dict( + type='PGDBBoxCoder', + base_depths=((28.01, 16.32), ), + base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), + (3.9, 1.56, 1.6)), + code_size=7), + **kwargs): + self.use_depth_classifier = use_depth_classifier + self.use_onlyreg_proj = use_onlyreg_proj + self.depth_branch = depth_branch + self.pred_keypoints = pred_keypoints + self.weight_dim = weight_dim + self.weight_branch = weight_branch + self.weight_out_channels = [] + for weight_branch_channels in weight_branch: + if len(weight_branch_channels) > 0: + self.weight_out_channels.append(weight_branch_channels[-1]) + else: + self.weight_out_channels.append(-1) + self.depth_range = depth_range + self.depth_unit = depth_unit + self.division = division + if self.division == 'uniform': + self.num_depth_cls = int( + (depth_range[1] - depth_range[0]) / depth_unit) + 1 + if self.num_depth_cls != depth_bins: + print('Warning: The number of bins computed from ' + + 'depth_unit is different from given parameter! ' + + 'Depth_unit will be considered with priority in ' + + 'Uniform Division.') + else: + self.num_depth_cls = depth_bins + super().__init__( + pred_bbox2d=pred_bbox2d, bbox_coder=bbox_coder, **kwargs) + self.loss_depth = build_loss(loss_depth) + if self.pred_bbox2d: + self.loss_bbox2d = build_loss(loss_bbox2d) + self.loss_consistency = build_loss(loss_consistency) + if self.pred_keypoints: + self.kpts_start = 9 if self.pred_velo else 7 + + def _init_layers(self): + """Initialize layers of the head.""" + super()._init_layers() + if self.pred_bbox2d: + self.scale_dim += 1 + if self.pred_keypoints: + self.scale_dim += 1 + self.scales = nn.ModuleList([ + nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) + for _ in self.strides + ]) + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + super()._init_predictor() + + if self.use_depth_classifier: + self.conv_depth_cls_prev = self._init_branch( + conv_channels=self.depth_branch, + conv_strides=(1, ) * len(self.depth_branch)) + self.conv_depth_cls = nn.Conv2d(self.depth_branch[-1], + self.num_depth_cls, 1) + # Data-agnostic single param lambda for local depth fusion + self.fuse_lambda = nn.Parameter(torch.tensor(10e-5)) + + if self.weight_dim != -1: + self.conv_weight_prevs = nn.ModuleList() + self.conv_weights = nn.ModuleList() + for i in range(self.weight_dim): + weight_branch_channels = self.weight_branch[i] + weight_out_channel = self.weight_out_channels[i] + if len(weight_branch_channels) > 0: + self.conv_weight_prevs.append( + self._init_branch( + conv_channels=weight_branch_channels, + conv_strides=(1, ) * len(weight_branch_channels))) + self.conv_weights.append( + nn.Conv2d(weight_out_channel, 1, 1)) + else: + self.conv_weight_prevs.append(None) + self.conv_weights.append( + nn.Conv2d(self.feat_channels, 1, 1)) + + def init_weights(self): + """Initialize weights of the head. 
+ + We currently still use the customized defined init_weights because the + default init of DCN triggered by the init_cfg will init + conv_offset.weight, which mistakenly affects the training stability. + """ + super().init_weights() + + bias_cls = bias_init_with_prob(0.01) + if self.use_depth_classifier: + for m in self.conv_depth_cls_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + normal_init(self.conv_depth_cls, std=0.01, bias=bias_cls) + + if self.weight_dim != -1: + for conv_weight_prev in self.conv_weight_prevs: + if conv_weight_prev is None: + continue + for m in conv_weight_prev: + if isinstance(m.conv, nn.Conv2d): + normal_init(m.conv, std=0.01) + for conv_weight in self.conv_weights: + normal_init(conv_weight, std=0.01) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2). + weight (list[Tensor]): Location-aware weight maps on each + scale level, each is a 4D-tensor, the channel number is + num_points * 1. + depth_cls_preds (list[Tensor]): Box scores for depth class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * self.num_depth_cls. + attr_preds (list[Tensor]): Attribute scores for each scale + level, each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, + each is a 4D-tensor, the channel number is num_points * 1. + """ + return multi_apply(self.forward_single, feats, self.scales, + self.strides) + + def forward_single(self, x, scale, stride): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize + the bbox prediction. + stride (int): The corresponding stride for feature maps, only + used to normalize the bbox prediction when self.norm_on_bbox + is True. + + Returns: + tuple: scores for each class, bbox and direction class + predictions, depth class predictions, location-aware weights, + attribute and centerness predictions of input feature maps. 
+ """ + cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, cls_feat, \ + reg_feat = super().forward_single(x, scale, stride) + + max_regress_range = stride * self.regress_ranges[0][1] / \ + self.strides[0] + bbox_pred = self.bbox_coder.decode_2d(bbox_pred, scale, stride, + max_regress_range, self.training, + self.pred_keypoints, + self.pred_bbox2d) + + depth_cls_pred = None + if self.use_depth_classifier: + clone_reg_feat = reg_feat.clone() + for conv_depth_cls_prev_layer in self.conv_depth_cls_prev: + clone_reg_feat = conv_depth_cls_prev_layer(clone_reg_feat) + depth_cls_pred = self.conv_depth_cls(clone_reg_feat) + + weight = None + if self.weight_dim != -1: + weight = [] + for i in range(self.weight_dim): + clone_reg_feat = reg_feat.clone() + if len(self.weight_branch[i]) > 0: + for conv_weight_prev_layer in self.conv_weight_prevs[i]: + clone_reg_feat = conv_weight_prev_layer(clone_reg_feat) + weight.append(self.conv_weights[i](clone_reg_feat)) + weight = torch.cat(weight, dim=1) + + return cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \ + attr_pred, centerness + + def get_proj_bbox2d(self, + bbox_preds, + pos_dir_cls_preds, + labels_3d, + bbox_targets_3d, + pos_points, + pos_inds, + img_metas, + pos_depth_cls_preds=None, + pos_weights=None, + pos_cls_scores=None, + with_kpts=False): + """Decode box predictions and get projected 2D attributes. + + Args: + bbox_preds (list[Tensor]): Box predictions for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + pos_dir_cls_preds (Tensor): Box scores for direction class + predictions of positive boxes on all the scale levels in shape + (num_pos_points, 2). + labels_3d (list[Tensor]): 3D box category labels for each scale + level, each is a 4D-tensor. + bbox_targets_3d (list[Tensor]): 3D box targets for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + pos_points (Tensor): Foreground points. + pos_inds (Tensor): Index of foreground points from flattened + tensors. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + pos_depth_cls_preds (Tensor, optional): Probabilistic depth map of + positive boxes on all the scale levels in shape + (num_pos_points, self.num_depth_cls). Defaults to None. + pos_weights (Tensor, optional): Location-aware weights of positive + boxes in shape (num_pos_points, self.weight_dim). Defaults to + None. + pos_cls_scores (Tensor, optional): Classification scores of + positive boxes in shape (num_pos_points, self.num_classes). + Defaults to None. + with_kpts (bool, optional): Whether to output keypoints targets. + Defaults to False. + + Returns: + tuple[Tensor]: Exterior 2D boxes from projected 3D boxes, + predicted 2D boxes and keypoint targets (if necessary). 
+ """ + views = [np.array(img_meta['cam2img']) for img_meta in img_metas] + num_imgs = len(img_metas) + img_idx = [] + for label in labels_3d: + for idx in range(num_imgs): + img_idx.append( + labels_3d[0].new_ones(int(len(label) / num_imgs)) * idx) + img_idx = torch.cat(img_idx) + pos_img_idx = img_idx[pos_inds] + + flatten_strided_bbox_preds = [] + flatten_strided_bbox2d_preds = [] + flatten_bbox_targets_3d = [] + flatten_strides = [] + + for stride_idx, bbox_pred in enumerate(bbox_preds): + flatten_bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape( + -1, sum(self.group_reg_dims)) + flatten_bbox_pred[:, :2] *= self.strides[stride_idx] + flatten_bbox_pred[:, -4:] *= self.strides[stride_idx] + flatten_strided_bbox_preds.append( + flatten_bbox_pred[:, :self.bbox_coder.bbox_code_size]) + flatten_strided_bbox2d_preds.append(flatten_bbox_pred[:, -4:]) + + bbox_target_3d = bbox_targets_3d[stride_idx].clone() + bbox_target_3d[:, :2] *= self.strides[stride_idx] + bbox_target_3d[:, -4:] *= self.strides[stride_idx] + flatten_bbox_targets_3d.append(bbox_target_3d) + + flatten_stride = flatten_bbox_pred.new_ones( + *flatten_bbox_pred.shape[:-1], 1) * self.strides[stride_idx] + flatten_strides.append(flatten_stride) + + flatten_strided_bbox_preds = torch.cat(flatten_strided_bbox_preds) + flatten_strided_bbox2d_preds = torch.cat(flatten_strided_bbox2d_preds) + flatten_bbox_targets_3d = torch.cat(flatten_bbox_targets_3d) + flatten_strides = torch.cat(flatten_strides) + pos_strided_bbox_preds = flatten_strided_bbox_preds[pos_inds] + pos_strided_bbox2d_preds = flatten_strided_bbox2d_preds[pos_inds] + pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] + pos_strides = flatten_strides[pos_inds] + + pos_decoded_bbox2d_preds = distance2bbox(pos_points, + pos_strided_bbox2d_preds) + + pos_strided_bbox_preds[:, :2] = \ + pos_points - pos_strided_bbox_preds[:, :2] + pos_bbox_targets_3d[:, :2] = \ + pos_points - pos_bbox_targets_3d[:, :2] + + if self.use_depth_classifier and (not self.use_onlyreg_proj): + pos_prob_depth_preds = self.bbox_coder.decode_prob_depth( + pos_depth_cls_preds, self.depth_range, self.depth_unit, + self.division, self.num_depth_cls) + sig_alpha = torch.sigmoid(self.fuse_lambda) + pos_strided_bbox_preds[:, 2] = \ + sig_alpha * pos_strided_bbox_preds.clone()[:, 2] + \ + (1 - sig_alpha) * pos_prob_depth_preds + + box_corners_in_image = pos_strided_bbox_preds.new_zeros( + (*pos_strided_bbox_preds.shape[:-1], 8, 2)) + box_corners_in_image_gt = pos_strided_bbox_preds.new_zeros( + (*pos_strided_bbox_preds.shape[:-1], 8, 2)) + + for idx in range(num_imgs): + mask = (pos_img_idx == idx) + if pos_strided_bbox_preds[mask].shape[0] == 0: + continue + cam2img = torch.eye( + 4, + dtype=pos_strided_bbox_preds.dtype, + device=pos_strided_bbox_preds.device) + view_shape = views[idx].shape + cam2img[:view_shape[0], :view_shape[1]] = \ + pos_strided_bbox_preds.new_tensor(views[idx]) + + centers2d_preds = pos_strided_bbox_preds.clone()[mask, :2] + centers2d_targets = pos_bbox_targets_3d.clone()[mask, :2] + centers3d_targets = points_img2cam(pos_bbox_targets_3d[mask, :3], + views[idx]) + + # use predicted depth to re-project the 2.5D centers + pos_strided_bbox_preds[mask, :3] = points_img2cam( + pos_strided_bbox_preds[mask, :3], views[idx]) + pos_bbox_targets_3d[mask, :3] = centers3d_targets + + # depth fixed when computing re-project 3D bboxes + pos_strided_bbox_preds[mask, 2] = \ + pos_bbox_targets_3d.clone()[mask, 2] + + # decode yaws + if self.use_direction_classifier: + pos_dir_cls_scores = torch.max( 
+ pos_dir_cls_preds[mask], dim=-1)[1] + pos_strided_bbox_preds[mask] = self.bbox_coder.decode_yaw( + pos_strided_bbox_preds[mask], centers2d_preds, + pos_dir_cls_scores, self.dir_offset, cam2img) + pos_bbox_targets_3d[mask, 6] = torch.atan2( + centers2d_targets[:, 0] - cam2img[0, 2], + cam2img[0, 0]) + pos_bbox_targets_3d[mask, 6] + + corners = img_metas[0]['box_type_3d']( + pos_strided_bbox_preds[mask], + box_dim=self.bbox_coder.bbox_code_size, + origin=(0.5, 0.5, 0.5)).corners + box_corners_in_image[mask] = points_cam2img(corners, cam2img) + + corners_gt = img_metas[0]['box_type_3d']( + pos_bbox_targets_3d[mask, :self.bbox_code_size], + box_dim=self.bbox_coder.bbox_code_size, + origin=(0.5, 0.5, 0.5)).corners + box_corners_in_image_gt[mask] = points_cam2img(corners_gt, cam2img) + + minxy = torch.min(box_corners_in_image, dim=1)[0] + maxxy = torch.max(box_corners_in_image, dim=1)[0] + proj_bbox2d_preds = torch.cat([minxy, maxxy], dim=1) + + outputs = (proj_bbox2d_preds, pos_decoded_bbox2d_preds) + + if with_kpts: + norm_strides = pos_strides * self.regress_ranges[0][1] / \ + self.strides[0] + kpts_targets = box_corners_in_image_gt - pos_points[..., None, :] + kpts_targets = kpts_targets.view( + (*pos_strided_bbox_preds.shape[:-1], 16)) + kpts_targets /= norm_strides + + outputs += (kpts_targets, ) + + return outputs + + def get_pos_predictions(self, bbox_preds, dir_cls_preds, depth_cls_preds, + weights, attr_preds, centernesses, pos_inds, + img_metas): + """Flatten predictions and get positive ones. + + Args: + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + depth_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * self.num_depth_cls. + attr_preds (list[Tensor]): Attribute scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + pos_inds (Tensor): Index of foreground points from flattened + tensors. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple[Tensor]: Box predictions, direction classes, probabilistic + depth maps, location-aware weight maps, attributes and + centerness predictions. 
+ """ + flatten_bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims)) + for bbox_pred in bbox_preds + ] + flatten_dir_cls_preds = [ + dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2) + for dir_cls_pred in dir_cls_preds + ] + flatten_centerness = [ + centerness.permute(0, 2, 3, 1).reshape(-1) + for centerness in centernesses + ] + flatten_bbox_preds = torch.cat(flatten_bbox_preds) + flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds) + flatten_centerness = torch.cat(flatten_centerness) + pos_bbox_preds = flatten_bbox_preds[pos_inds] + pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds] + pos_centerness = flatten_centerness[pos_inds] + + pos_depth_cls_preds = None + if self.use_depth_classifier: + flatten_depth_cls_preds = [ + depth_cls_pred.permute(0, 2, 3, + 1).reshape(-1, self.num_depth_cls) + for depth_cls_pred in depth_cls_preds + ] + flatten_depth_cls_preds = torch.cat(flatten_depth_cls_preds) + pos_depth_cls_preds = flatten_depth_cls_preds[pos_inds] + + pos_weights = None + if self.weight_dim != -1: + flatten_weights = [ + weight.permute(0, 2, 3, 1).reshape(-1, self.weight_dim) + for weight in weights + ] + flatten_weights = torch.cat(flatten_weights) + pos_weights = flatten_weights[pos_inds] + + pos_attr_preds = None + if self.pred_attrs: + flatten_attr_preds = [ + attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs) + for attr_pred in attr_preds + ] + flatten_attr_preds = torch.cat(flatten_attr_preds) + pos_attr_preds = flatten_attr_preds[pos_inds] + + return pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, \ + pos_weights, pos_attr_preds, pos_centerness + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', + 'depth_cls_preds', 'weights', 'attr_preds', 'centernesses')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + depth_cls_preds, + weights, + attr_preds, + centernesses, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + depth_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * self.num_depth_cls. + weights (list[Tensor]): Location-aware weights for each scale + level, each is a 4D-tensor, the channel number is + num_points * self.weight_dim. + attr_preds (list[Tensor]): Attribute scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_attrs. + centernesses (list[Tensor]): Centerness for each scale level, each + is a 4D-tensor, the channel number is num_points * 1. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of + (num_gts, code_size). + gt_labels_3d (list[Tensor]): same as gt_labels + centers2d (list[Tensor]): 2D centers on the image with shape of + (num_gts, 2). 
+ depths (list[Tensor]): Depth ground truth with shape of + (num_gts, ). + attr_labels (list[Tensor]): Attributes indices of each box. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can + be ignored when computing the loss. Defaults to None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ + len(depth_cls_preds) == len(weights) == len(centernesses) == \ + len(attr_preds), 'The length of cls_scores, bbox_preds, ' \ + 'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \ + f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \ + f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \ + f'{len(centernesses)}, {len(attr_preds)} are inconsistent.' + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \ + self.get_targets( + all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, attr_labels) + + num_imgs = cls_scores[0].size(0) + # flatten cls_scores and targets + flatten_cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) + for cls_score in cls_scores + ] + flatten_cls_scores = torch.cat(flatten_cls_scores) + flatten_labels_3d = torch.cat(labels_3d) + flatten_bbox_targets_3d = torch.cat(bbox_targets_3d) + flatten_centerness_targets = torch.cat(centerness_targets) + flatten_points = torch.cat( + [points.repeat(num_imgs, 1) for points in all_level_points]) + if self.pred_attrs: + flatten_attr_targets = torch.cat(attr_targets) + + # FG cat_id: [0, num_classes -1], BG cat_id: num_classes + bg_class_ind = self.num_classes + pos_inds = ((flatten_labels_3d >= 0) + & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1) + num_pos = len(pos_inds) + + loss_dict = dict() + + loss_dict['loss_cls'] = self.loss_cls( + flatten_cls_scores, + flatten_labels_3d, + avg_factor=num_pos + num_imgs) # avoid num_pos is 0 + + pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, pos_weights, \ + pos_attr_preds, pos_centerness = self.get_pos_predictions( + bbox_preds, dir_cls_preds, depth_cls_preds, weights, + attr_preds, centernesses, pos_inds, img_metas) + + if num_pos > 0: + pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] + pos_centerness_targets = flatten_centerness_targets[pos_inds] + pos_points = flatten_points[pos_inds] + if self.pred_attrs: + pos_attr_targets = flatten_attr_targets[pos_inds] + if self.use_direction_classifier: + pos_dir_cls_targets = self.get_direction_target( + pos_bbox_targets_3d, self.dir_offset, one_hot=False) + + bbox_weights = pos_centerness_targets.new_ones( + len(pos_centerness_targets), sum(self.group_reg_dims)) + equal_weights = pos_centerness_targets.new_ones( + pos_centerness_targets.shape) + code_weight = self.train_cfg.get('code_weight', None) + if code_weight: + assert len(code_weight) == sum(self.group_reg_dims) + bbox_weights = bbox_weights * bbox_weights.new_tensor( + code_weight) + + if self.diff_rad_by_sin: + pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( + pos_bbox_preds, pos_bbox_targets_3d) + + loss_dict['loss_offset'] = self.loss_bbox( + pos_bbox_preds[:, :2], + pos_bbox_targets_3d[:, :2], + weight=bbox_weights[:, :2], + avg_factor=equal_weights.sum()) + loss_dict['loss_size'] = 
self.loss_bbox( + pos_bbox_preds[:, 3:6], + pos_bbox_targets_3d[:, 3:6], + weight=bbox_weights[:, 3:6], + avg_factor=equal_weights.sum()) + loss_dict['loss_rotsin'] = self.loss_bbox( + pos_bbox_preds[:, 6], + pos_bbox_targets_3d[:, 6], + weight=bbox_weights[:, 6], + avg_factor=equal_weights.sum()) + if self.pred_velo: + loss_dict['loss_velo'] = self.loss_bbox( + pos_bbox_preds[:, 7:9], + pos_bbox_targets_3d[:, 7:9], + weight=bbox_weights[:, 7:9], + avg_factor=equal_weights.sum()) + + proj_bbox2d_inputs = (bbox_preds, pos_dir_cls_preds, labels_3d, + bbox_targets_3d, pos_points, pos_inds, + img_metas) + + # direction classification loss + # TODO: add more check for use_direction_classifier + if self.use_direction_classifier: + loss_dict['loss_dir'] = self.loss_dir( + pos_dir_cls_preds, + pos_dir_cls_targets, + equal_weights, + avg_factor=equal_weights.sum()) + + # init depth loss with the one computed from direct regression + loss_dict['loss_depth'] = self.loss_bbox( + pos_bbox_preds[:, 2], + pos_bbox_targets_3d[:, 2], + weight=bbox_weights[:, 2], + avg_factor=equal_weights.sum()) + # depth classification loss + if self.use_depth_classifier: + pos_prob_depth_preds = self.bbox_coder.decode_prob_depth( + pos_depth_cls_preds, self.depth_range, self.depth_unit, + self.division, self.num_depth_cls) + sig_alpha = torch.sigmoid(self.fuse_lambda) + if self.weight_dim != -1: + loss_fuse_depth = self.loss_depth( + sig_alpha * pos_bbox_preds[:, 2] + + (1 - sig_alpha) * pos_prob_depth_preds, + pos_bbox_targets_3d[:, 2], + sigma=pos_weights[:, 0], + weight=bbox_weights[:, 2], + avg_factor=equal_weights.sum()) + else: + loss_fuse_depth = self.loss_depth( + sig_alpha * pos_bbox_preds[:, 2] + + (1 - sig_alpha) * pos_prob_depth_preds, + pos_bbox_targets_3d[:, 2], + weight=bbox_weights[:, 2], + avg_factor=equal_weights.sum()) + loss_dict['loss_depth'] = loss_fuse_depth + + proj_bbox2d_inputs += (pos_depth_cls_preds, ) + + if self.pred_keypoints: + # use smoothL1 to compute consistency loss for keypoints + # normalize the offsets with strides + proj_bbox2d_preds, pos_decoded_bbox2d_preds, kpts_targets = \ + self.get_proj_bbox2d(*proj_bbox2d_inputs, with_kpts=True) + loss_dict['loss_kpts'] = self.loss_bbox( + pos_bbox_preds[:, self.kpts_start:self.kpts_start + 16], + kpts_targets, + weight=bbox_weights[:, + self.kpts_start:self.kpts_start + 16], + avg_factor=equal_weights.sum()) + + if self.pred_bbox2d: + loss_dict['loss_bbox2d'] = self.loss_bbox2d( + pos_bbox_preds[:, -4:], + pos_bbox_targets_3d[:, -4:], + weight=bbox_weights[:, -4:], + avg_factor=equal_weights.sum()) + if not self.pred_keypoints: + proj_bbox2d_preds, pos_decoded_bbox2d_preds = \ + self.get_proj_bbox2d(*proj_bbox2d_inputs) + loss_dict['loss_consistency'] = self.loss_consistency( + proj_bbox2d_preds, + pos_decoded_bbox2d_preds, + weight=bbox_weights[:, -4:], + avg_factor=equal_weights.sum()) + + loss_dict['loss_centerness'] = self.loss_centerness( + pos_centerness, pos_centerness_targets) + + # attribute classification loss + if self.pred_attrs: + loss_dict['loss_attr'] = self.loss_attr( + pos_attr_preds, + pos_attr_targets, + pos_centerness_targets, + avg_factor=pos_centerness_targets.sum()) + + else: + # need absolute due to possible negative delta x/y + loss_dict['loss_offset'] = pos_bbox_preds[:, :2].sum() + loss_dict['loss_size'] = pos_bbox_preds[:, 3:6].sum() + loss_dict['loss_rotsin'] = pos_bbox_preds[:, 6].sum() + loss_dict['loss_depth'] = pos_bbox_preds[:, 2].sum() + if self.pred_velo: + loss_dict['loss_velo'] = pos_bbox_preds[:, 
7:9].sum() + if self.pred_keypoints: + loss_dict['loss_kpts'] = pos_bbox_preds[:, + self.kpts_start:self. + kpts_start + 16].sum() + if self.pred_bbox2d: + loss_dict['loss_bbox2d'] = pos_bbox_preds[:, -4:].sum() + loss_dict['loss_consistency'] = pos_bbox_preds[:, -4:].sum() + loss_dict['loss_centerness'] = pos_centerness.sum() + if self.use_direction_classifier: + loss_dict['loss_dir'] = pos_dir_cls_preds.sum() + if self.use_depth_classifier: + sig_alpha = torch.sigmoid(self.fuse_lambda) + loss_fuse_depth = \ + sig_alpha * pos_bbox_preds[:, 2].sum() + \ + (1 - sig_alpha) * pos_depth_cls_preds.sum() + if self.weight_dim != -1: + loss_fuse_depth *= torch.exp(-pos_weights[:, 0].sum()) + loss_dict['loss_depth'] = loss_fuse_depth + if self.pred_attrs: + loss_dict['loss_attr'] = pos_attr_preds.sum() + + return loss_dict + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', + 'depth_cls_preds', 'weights', 'attr_preds', 'centernesses')) + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + depth_cls_preds, + weights, + attr_preds, + centernesses, + img_metas, + cfg=None, + rescale=None): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W) + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * 2. (bin = 2) + depth_cls_preds (list[Tensor]): Box scores for direction class + predictions on each scale level, each is a 4D-tensor, + the channel number is num_points * self.num_depth_cls. + weights (list[Tensor]): Location-aware weights for each scale + level, each is a 4D-tensor, the channel number is + num_points * self.weight_dim. + attr_preds (list[Tensor]): Attribute scores for each scale level + Has shape (N, num_points * num_attrs, H, W) + centernesses (list[Tensor]): Centerness for each scale level with + shape (N, num_points * 1, H, W) + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config, optional): Test / postprocessing configuration, + if None, test_cfg would be used. Defaults to None. + rescale (bool, optional): If True, return boxes in original image + space. Defaults to None. + + Returns: + list[tuple[Tensor]]: Each item in result_list is a tuple, which + consists of predicted 3D boxes, scores, labels, attributes and + 2D boxes (if necessary). + """ + assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ + len(depth_cls_preds) == len(weights) == len(centernesses) == \ + len(attr_preds), 'The length of cls_scores, bbox_preds, ' \ + 'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \ + f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \ + f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \ + f'{len(centernesses)}, {len(attr_preds)} are inconsistent.' 
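# --- Editorial note, not part of the patch -------------------------------------
# In the `num_pos == 0` branch of `loss()` above, every loss term is written as
# `pred[...].sum()` over an empty selection. The sum of an empty slice is 0 but
# remains attached to the autograd graph, so all head outputs still contribute
# a (zero) gradient -- presumably to keep distributed training from tripping
# over unused parameters when a batch has no positive points. A minimal
# stand-alone check of that behaviour:
import torch

pred = torch.randn(0, 7, requires_grad=True)   # zero positive samples
zero_loss = pred[:, :2].sum()                  # tensor(0.) with a grad_fn
zero_loss.backward()
assert torch.equal(pred.grad, torch.zeros_like(pred))
# -------------------------------------------------------------------------------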
+ num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, + bbox_preds[0].device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + if self.use_direction_classifier: + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() + for i in range(num_levels) + ] + else: + dir_cls_pred_list = [ + cls_scores[i][img_id].new_full( + [2, *cls_scores[i][img_id].shape[1:]], 0).detach() + for i in range(num_levels) + ] + if self.use_depth_classifier: + depth_cls_pred_list = [ + depth_cls_preds[i][img_id].detach() + for i in range(num_levels) + ] + else: + depth_cls_pred_list = [ + cls_scores[i][img_id].new_full( + [self.num_depth_cls, *cls_scores[i][img_id].shape[1:]], + 0).detach() for i in range(num_levels) + ] + if self.weight_dim != -1: + weight_list = [ + weights[i][img_id].detach() for i in range(num_levels) + ] + else: + weight_list = [ + cls_scores[i][img_id].new_full( + [1, *cls_scores[i][img_id].shape[1:]], 0).detach() + for i in range(num_levels) + ] + if self.pred_attrs: + attr_pred_list = [ + attr_preds[i][img_id].detach() for i in range(num_levels) + ] + else: + attr_pred_list = [ + cls_scores[i][img_id].new_full( + [self.num_attrs, *cls_scores[i][img_id].shape[1:]], + self.attr_background_label).detach() + for i in range(num_levels) + ] + centerness_pred_list = [ + centernesses[i][img_id].detach() for i in range(num_levels) + ] + input_meta = img_metas[img_id] + det_bboxes = self._get_bboxes_single( + cls_score_list, bbox_pred_list, dir_cls_pred_list, + depth_cls_pred_list, weight_list, attr_pred_list, + centerness_pred_list, mlvl_points, input_meta, cfg, rescale) + result_list.append(det_bboxes) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + depth_cls_preds, + weights, + attr_preds, + centernesses, + mlvl_points, + input_meta, + cfg, + rescale=False): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for a single scale level + Has shape (num_points * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for a single scale + level with shape (num_points * bbox_code_size, H, W). + dir_cls_preds (list[Tensor]): Box scores for direction class + predictions on a single scale level with shape + (num_points * 2, H, W) + depth_cls_preds (list[Tensor]): Box scores for probabilistic depth + predictions on a single scale level with shape + (num_points * self.num_depth_cls, H, W) + weights (list[Tensor]): Location-aware weight maps on a single + scale level with shape (num_points * self.weight_dim, H, W). + attr_preds (list[Tensor]): Attribute scores for each scale level + Has shape (N, num_points * num_attrs, H, W) + centernesses (list[Tensor]): Centerness for a single scale level + with shape (num_points, H, W). + mlvl_points (list[Tensor]): Box reference for a single scale level + with shape (num_total_points, 2). + input_meta (dict): Metadata of input image. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool, optional): If True, return boxes in original image + space. Defaults to False. + + Returns: + tuples[Tensor]: Predicted 3D boxes, scores, labels, attributes and + 2D boxes (if necessary). 
+ """ + view = np.array(input_meta['cam2img']) + scale_factor = input_meta['scale_factor'] + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) + mlvl_centers2d = [] + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + mlvl_attr_scores = [] + mlvl_centerness = [] + mlvl_depth_cls_scores = [] + mlvl_depth_uncertainty = [] + mlvl_bboxes2d = None + if self.pred_bbox2d: + mlvl_bboxes2d = [] + + for cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \ + attr_pred, centerness, points in zip( + cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds, + weights, attr_preds, centernesses, mlvl_points): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + scores = cls_score.permute(1, 2, 0).reshape( + -1, self.cls_out_channels).sigmoid() + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + depth_cls_pred = depth_cls_pred.permute(1, 2, 0).reshape( + -1, self.num_depth_cls) + depth_cls_score = F.softmax( + depth_cls_pred, dim=-1).topk( + k=2, dim=-1)[0].mean(dim=-1) + if self.weight_dim != -1: + weight = weight.permute(1, 2, 0).reshape(-1, self.weight_dim) + else: + weight = weight.permute(1, 2, 0).reshape(-1, 1) + depth_uncertainty = torch.exp(-weight[:, -1]) + attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs) + attr_score = torch.max(attr_pred, dim=-1)[1] + centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() + + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, + sum(self.group_reg_dims)) + bbox_pred3d = bbox_pred[:, :self.bbox_coder.bbox_code_size] + if self.pred_bbox2d: + bbox_pred2d = bbox_pred[:, -4:] + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + merged_scores = scores * centerness[:, None] + if self.use_depth_classifier: + merged_scores *= depth_cls_score[:, None] + if self.weight_dim != -1: + merged_scores *= depth_uncertainty[:, None] + max_scores, _ = merged_scores.max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + points = points[topk_inds, :] + bbox_pred3d = bbox_pred3d[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_pred = dir_cls_pred[topk_inds, :] + depth_cls_pred = depth_cls_pred[topk_inds, :] + centerness = centerness[topk_inds] + dir_cls_score = dir_cls_score[topk_inds] + depth_cls_score = depth_cls_score[topk_inds] + depth_uncertainty = depth_uncertainty[topk_inds] + attr_score = attr_score[topk_inds] + if self.pred_bbox2d: + bbox_pred2d = bbox_pred2d[topk_inds, :] + # change the offset to actual center predictions + bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2] + if rescale: + bbox_pred3d[:, :2] /= bbox_pred3d[:, :2].new_tensor( + scale_factor) + if self.pred_bbox2d: + bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor) + if self.use_depth_classifier: + prob_depth_pred = self.bbox_coder.decode_prob_depth( + depth_cls_pred, self.depth_range, self.depth_unit, + self.division, self.num_depth_cls) + sig_alpha = torch.sigmoid(self.fuse_lambda) + bbox_pred3d[:, 2] = sig_alpha * bbox_pred3d[:, 2] + \ + (1 - sig_alpha) * prob_depth_pred + pred_center2d = bbox_pred3d[:, :3].clone() + bbox_pred3d[:, :3] = points_img2cam(bbox_pred3d[:, :3], view) + mlvl_centers2d.append(pred_center2d) + mlvl_bboxes.append(bbox_pred3d) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + mlvl_depth_cls_scores.append(depth_cls_score) + mlvl_attr_scores.append(attr_score) + mlvl_centerness.append(centerness) + mlvl_depth_uncertainty.append(depth_uncertainty) + if 
self.pred_bbox2d: + bbox_pred2d = distance2bbox( + points, bbox_pred2d, max_shape=input_meta['img_shape']) + mlvl_bboxes2d.append(bbox_pred2d) + + mlvl_centers2d = torch.cat(mlvl_centers2d) + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + if self.pred_bbox2d: + mlvl_bboxes2d = torch.cat(mlvl_bboxes2d) + + # change local yaw to global yaw for 3D nms + cam2img = torch.eye( + 4, dtype=mlvl_centers2d.dtype, device=mlvl_centers2d.device) + cam2img[:view.shape[0], :view.shape[1]] = \ + mlvl_centers2d.new_tensor(view) + mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, + mlvl_dir_scores, + self.dir_offset, cam2img) + + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + mlvl_bboxes, + box_dim=self.bbox_coder.bbox_code_size, + origin=(0.5, 0.5, 0.5)).bev) + + mlvl_scores = torch.cat(mlvl_scores) + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 + # BG cat_id: num_class + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + mlvl_attr_scores = torch.cat(mlvl_attr_scores) + mlvl_centerness = torch.cat(mlvl_centerness) + # no scale_factors in box3d_multiclass_nms + # Then we multiply it from outside + mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None] + if self.use_depth_classifier: # multiply the depth confidence + mlvl_depth_cls_scores = torch.cat(mlvl_depth_cls_scores) + mlvl_nms_scores *= mlvl_depth_cls_scores[:, None] + if self.weight_dim != -1: + mlvl_depth_uncertainty = torch.cat(mlvl_depth_uncertainty) + mlvl_nms_scores *= mlvl_depth_uncertainty[:, None] + results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_nms_scores, cfg.score_thr, + cfg.max_per_img, cfg, mlvl_dir_scores, + mlvl_attr_scores, mlvl_bboxes2d) + bboxes, scores, labels, dir_scores, attrs = results[0:5] + attrs = attrs.to(labels.dtype) # change data type to int + bboxes = input_meta['box_type_3d']( + bboxes, + box_dim=self.bbox_coder.bbox_code_size, + origin=(0.5, 0.5, 0.5)) + # Note that the predictions use origin (0.5, 0.5, 0.5) + # Due to the ground truth centers2d are the gravity center of objects + # v0.10.0 fix inplace operation to the input tensor of cam_box3d + # So here we also need to add origin=(0.5, 0.5, 0.5) + if not self.pred_attrs: + attrs = None + + outputs = (bboxes, scores, labels, attrs) + if self.pred_bbox2d: + bboxes2d = results[-1] + bboxes2d = torch.cat([bboxes2d, scores[:, None]], dim=1) + outputs = outputs + (bboxes2d, ) + + return outputs + + def get_targets(self, points, gt_bboxes_list, gt_labels_list, + gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, + depths_list, attr_labels_list): + """Compute regression, classification and centerss targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each + image, each has shape (num_gt, bbox_code_size). + gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each + box, each has shape (num_gt,). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + each has shape (num_gt, 2). + depths_list (list[Tensor]): Depth of projected 3D centers onto 2D + image, each has shape (num_gt, 1). 
+ attr_labels_list (list[Tensor]): Attribute labels of each box, + each has shape (num_gt,). + + Returns: + tuple: + concat_lvl_labels (list[Tensor]): Labels of each level. \ + concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ + level. + """ + assert len(points) == len(self.regress_ranges) + num_levels = len(points) + # expand regress ranges to align with points + expanded_regress_ranges = [ + points[i].new_tensor(self.regress_ranges[i])[None].expand_as( + points[i]) for i in range(num_levels) + ] + # concat all levels points and regress ranges + concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) + concat_points = torch.cat(points, dim=0) + + # the number of points per img, per lvl + num_points = [center.size(0) for center in points] + + if attr_labels_list is None: + attr_labels_list = [ + gt_labels.new_full(gt_labels.shape, self.attr_background_label) + for gt_labels in gt_labels_list + ] + + # get labels and bbox_targets of each image + _, bbox_targets_list, labels_3d_list, bbox_targets_3d_list, \ + centerness_targets_list, attr_targets_list = multi_apply( + self._get_target_single, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_3d_list, + gt_labels_3d_list, + centers2d_list, + depths_list, + attr_labels_list, + points=concat_points, + regress_ranges=concat_regress_ranges, + num_points_per_lvl=num_points) + + # split to per img, per level + bbox_targets_list = [ + bbox_targets.split(num_points, 0) + for bbox_targets in bbox_targets_list + ] + labels_3d_list = [ + labels_3d.split(num_points, 0) for labels_3d in labels_3d_list + ] + bbox_targets_3d_list = [ + bbox_targets_3d.split(num_points, 0) + for bbox_targets_3d in bbox_targets_3d_list + ] + centerness_targets_list = [ + centerness_targets.split(num_points, 0) + for centerness_targets in centerness_targets_list + ] + attr_targets_list = [ + attr_targets.split(num_points, 0) + for attr_targets in attr_targets_list + ] + + # concat per level image + concat_lvl_labels_3d = [] + concat_lvl_bbox_targets_3d = [] + concat_lvl_centerness_targets = [] + concat_lvl_attr_targets = [] + for i in range(num_levels): + concat_lvl_labels_3d.append( + torch.cat([labels[i] for labels in labels_3d_list])) + concat_lvl_centerness_targets.append( + torch.cat([ + centerness_targets[i] + for centerness_targets in centerness_targets_list + ])) + bbox_targets_3d = torch.cat([ + bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list + ]) + if self.pred_bbox2d: + bbox_targets = torch.cat( + [bbox_targets[i] for bbox_targets in bbox_targets_list]) + bbox_targets_3d = torch.cat([bbox_targets_3d, bbox_targets], + dim=1) + concat_lvl_attr_targets.append( + torch.cat( + [attr_targets[i] for attr_targets in attr_targets_list])) + if self.norm_on_bbox: + bbox_targets_3d[:, :2] = \ + bbox_targets_3d[:, :2] / self.strides[i] + if self.pred_bbox2d: + bbox_targets_3d[:, -4:] = \ + bbox_targets_3d[:, -4:] / self.strides[i] + concat_lvl_bbox_targets_3d.append(bbox_targets_3d) + return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \ + concat_lvl_centerness_targets, concat_lvl_attr_targets diff --git a/mmdet3d/models/dense_heads/point_rpn_head.py b/mmdet3d/models/dense_heads/point_rpn_head.py index 546cf16..22dad4c 100644 --- a/mmdet3d/models/dense_heads/point_rpn_head.py +++ b/mmdet3d/models/dense_heads/point_rpn_head.py @@ -1,381 +1,381 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
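# --- Editorial sketch, not part of the patch -----------------------------------
# `get_targets` in pgd_head.py above first obtains one flat target tensor per
# image (all FPN levels concatenated), splits it by the per-level point counts
# and then concatenates across images so that each output entry holds a single
# level. A tiny illustration of that reshuffling with made-up numbers:
import torch

num_points_per_lvl = [16, 4]                                 # hypothetical 2-level FPN
per_image = [torch.arange(20.), torch.arange(20.) + 100.]    # two images
per_image_split = [t.split(num_points_per_lvl, 0) for t in per_image]
per_level = [
    torch.cat([img[lvl] for img in per_image_split])
    for lvl in range(len(num_points_per_lvl))
]
assert per_level[0].numel() == 32 and per_level[1].numel() == 8
# -------------------------------------------------------------------------------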
-import torch -from mmcv.runner import BaseModule, force_fp32 -from torch import nn as nn - -from mmdet3d.core import xywhr2xyxyr -from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes, - LiDARInstance3DBoxes) -from mmdet3d.core.post_processing import nms_bev, nms_normal_bev -from mmdet.core import build_bbox_coder, multi_apply -from ..builder import HEADS, build_loss - - -@HEADS.register_module() -class PointRPNHead(BaseModule): - """RPN module for PointRCNN. - - Args: - num_classes (int): Number of classes. - train_cfg (dict): Train configs. - test_cfg (dict): Test configs. - pred_layer_cfg (dict, optional): Config of classification and - regression prediction layers. Defaults to None. - enlarge_width (float, optional): Enlarge bbox for each side to ignore - close points. Defaults to 0.1. - cls_loss (dict, optional): Config of direction classification loss. - Defaults to None. - bbox_loss (dict, optional): Config of localization loss. - Defaults to None. - bbox_coder (dict, optional): Config dict of box coders. - Defaults to None. - init_cfg (dict, optional): Config of initialization. Defaults to None. - """ - - def __init__(self, - num_classes, - train_cfg, - test_cfg, - pred_layer_cfg=None, - enlarge_width=0.1, - cls_loss=None, - bbox_loss=None, - bbox_coder=None, - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.num_classes = num_classes - self.train_cfg = train_cfg - self.test_cfg = test_cfg - self.enlarge_width = enlarge_width - - # build loss function - self.bbox_loss = build_loss(bbox_loss) - self.cls_loss = build_loss(cls_loss) - - # build box coder - self.bbox_coder = build_bbox_coder(bbox_coder) - - # build pred conv - self.cls_layers = self._make_fc_layers( - fc_cfg=pred_layer_cfg.cls_linear_channels, - input_channels=pred_layer_cfg.in_channels, - output_channels=self._get_cls_out_channels()) - - self.reg_layers = self._make_fc_layers( - fc_cfg=pred_layer_cfg.reg_linear_channels, - input_channels=pred_layer_cfg.in_channels, - output_channels=self._get_reg_out_channels()) - - def _make_fc_layers(self, fc_cfg, input_channels, output_channels): - """Make fully connect layers. - - Args: - fc_cfg (dict): Config of fully connect. - input_channels (int): Input channels for fc_layers. - output_channels (int): Input channels for fc_layers. - - Returns: - nn.Sequential: Fully connect layers. - """ - fc_layers = [] - c_in = input_channels - for k in range(0, fc_cfg.__len__()): - fc_layers.extend([ - nn.Linear(c_in, fc_cfg[k], bias=False), - nn.BatchNorm1d(fc_cfg[k]), - nn.ReLU(), - ]) - c_in = fc_cfg[k] - fc_layers.append(nn.Linear(c_in, output_channels, bias=True)) - return nn.Sequential(*fc_layers) - - def _get_cls_out_channels(self): - """Return the channel number of classification outputs.""" - # Class numbers (k) + objectness (1) - return self.num_classes - - def _get_reg_out_channels(self): - """Return the channel number of regression outputs.""" - # Bbox classification and regression - # (center residual (3), size regression (3) - # torch.cos(yaw) (1), torch.sin(yaw) (1) - return self.bbox_coder.code_size - - def forward(self, feat_dict): - """Forward pass. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - tuple[list[torch.Tensor]]: Predicted boxes and classification - scores. 
- """ - point_features = feat_dict['fp_features'] - point_features = point_features.permute(0, 2, 1).contiguous() - batch_size = point_features.shape[0] - feat_cls = point_features.view(-1, point_features.shape[-1]) - feat_reg = point_features.view(-1, point_features.shape[-1]) - - point_cls_preds = self.cls_layers(feat_cls).reshape( - batch_size, -1, self._get_cls_out_channels()) - point_box_preds = self.reg_layers(feat_reg).reshape( - batch_size, -1, self._get_reg_out_channels()) - return point_box_preds, point_cls_preds - - @force_fp32(apply_to=('bbox_preds')) - def loss(self, - bbox_preds, - cls_preds, - points, - gt_bboxes_3d, - gt_labels_3d, - img_metas=None): - """Compute loss. - - Args: - bbox_preds (dict): Predictions from forward of PointRCNN RPN_Head. - cls_preds (dict): Classification from forward of PointRCNN - RPN_Head. - points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each sample. - gt_labels_3d (list[torch.Tensor]): Labels of each sample. - img_metas (list[dict], Optional): Contain pcd and img's meta info. - Defaults to None. - - Returns: - dict: Losses of PointRCNN RPN module. - """ - targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d) - (bbox_targets, mask_targets, positive_mask, negative_mask, - box_loss_weights, point_targets) = targets - - # bbox loss - bbox_loss = self.bbox_loss(bbox_preds, bbox_targets, - box_loss_weights.unsqueeze(-1)) - # calculate semantic loss - semantic_points = cls_preds.reshape(-1, self.num_classes) - semantic_targets = mask_targets - semantic_targets[negative_mask] = self.num_classes - semantic_points_label = semantic_targets - # for ignore, but now we do not have ignored label - semantic_loss_weight = negative_mask.float() + positive_mask.float() - semantic_loss = self.cls_loss(semantic_points, - semantic_points_label.reshape(-1), - semantic_loss_weight.reshape(-1)) - semantic_loss /= positive_mask.float().sum() - losses = dict(bbox_loss=bbox_loss, semantic_loss=semantic_loss) - - return losses - - def get_targets(self, points, gt_bboxes_3d, gt_labels_3d): - """Generate targets of PointRCNN RPN head. - - Args: - points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): Labels of each batch. - - Returns: - tuple[torch.Tensor]: Targets of PointRCNN RPN head. - """ - # find empty example - for index in range(len(gt_labels_3d)): - if len(gt_labels_3d[index]) == 0: - fake_box = gt_bboxes_3d[index].tensor.new_zeros( - 1, gt_bboxes_3d[index].tensor.shape[-1]) - gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) - gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) - - (bbox_targets, mask_targets, positive_mask, negative_mask, - point_targets) = multi_apply(self.get_targets_single, points, - gt_bboxes_3d, gt_labels_3d) - - bbox_targets = torch.stack(bbox_targets) - mask_targets = torch.stack(mask_targets) - positive_mask = torch.stack(positive_mask) - negative_mask = torch.stack(negative_mask) - box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6) - - return (bbox_targets, mask_targets, positive_mask, negative_mask, - box_loss_weights, point_targets) - - def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d): - """Generate targets of PointRCNN RPN head for single batch. - - Args: - points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth - boxes of each batch. 
- gt_labels_3d (torch.Tensor): Labels of each batch. - - Returns: - tuple[torch.Tensor]: Targets of ssd3d head. - """ - gt_bboxes_3d = gt_bboxes_3d.to(points.device) - - valid_gt = gt_labels_3d != -1 - gt_bboxes_3d = gt_bboxes_3d[valid_gt] - gt_labels_3d = gt_labels_3d[valid_gt] - - # transform the bbox coordinate to the point cloud coordinate - gt_bboxes_3d_tensor = gt_bboxes_3d.tensor.clone() - gt_bboxes_3d_tensor[..., 2] += gt_bboxes_3d_tensor[..., 5] / 2 - - points_mask, assignment = self._assign_targets_by_points_inside( - gt_bboxes_3d, points) - gt_bboxes_3d_tensor = gt_bboxes_3d_tensor[assignment] - mask_targets = gt_labels_3d[assignment] - - bbox_targets = self.bbox_coder.encode(gt_bboxes_3d_tensor, - points[..., 0:3], mask_targets) - - positive_mask = (points_mask.max(1)[0] > 0) - # add ignore_mask - extend_gt_bboxes_3d = gt_bboxes_3d.enlarged_box(self.enlarge_width) - points_mask, _ = self._assign_targets_by_points_inside( - extend_gt_bboxes_3d, points) - negative_mask = (points_mask.max(1)[0] == 0) - - point_targets = points[..., 0:3] - return (bbox_targets, mask_targets, positive_mask, negative_mask, - point_targets) - - def get_bboxes(self, - points, - bbox_preds, - cls_preds, - input_metas, - rescale=False): - """Generate bboxes from RPN head predictions. - - Args: - points (torch.Tensor): Input points. - bbox_preds (dict): Regression predictions from PointRCNN head. - cls_preds (dict): Class scores predictions from PointRCNN head. - input_metas (list[dict]): Point cloud and image's meta info. - rescale (bool, optional): Whether to rescale bboxes. - Defaults to False. - - Returns: - list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. - """ - sem_scores = cls_preds.sigmoid() - obj_scores = sem_scores.max(-1)[0] - object_class = sem_scores.argmax(dim=-1) - - batch_size = sem_scores.shape[0] - results = list() - for b in range(batch_size): - bbox3d = self.bbox_coder.decode(bbox_preds[b], points[b, ..., :3], - object_class[b]) - bbox_selected, score_selected, labels, cls_preds_selected = \ - self.class_agnostic_nms(obj_scores[b], sem_scores[b], bbox3d, - points[b, ..., :3], input_metas[b]) - bbox = input_metas[b]['box_type_3d']( - bbox_selected.clone(), - box_dim=bbox_selected.shape[-1], - with_yaw=True) - results.append((bbox, score_selected, labels, cls_preds_selected)) - return results - - def class_agnostic_nms(self, obj_scores, sem_scores, bbox, points, - input_meta): - """Class agnostic nms. - - Args: - obj_scores (torch.Tensor): Objectness score of bounding boxes. - sem_scores (torch.Tensor): Semantic class score of bounding boxes. - bbox (torch.Tensor): Predicted bounding boxes. - - Returns: - tuple[torch.Tensor]: Bounding boxes, scores and labels. 
- """ - nms_cfg = self.test_cfg.nms_cfg if not self.training \ - else self.train_cfg.nms_cfg - if nms_cfg.use_rotate_nms: - nms_func = nms_bev - else: - nms_func = nms_normal_bev - - num_bbox = bbox.shape[0] - bbox = input_meta['box_type_3d']( - bbox.clone(), - box_dim=bbox.shape[-1], - with_yaw=True, - origin=(0.5, 0.5, 0.5)) - - if isinstance(bbox, LiDARInstance3DBoxes): - box_idx = bbox.points_in_boxes(points) - box_indices = box_idx.new_zeros([num_bbox + 1]) - box_idx[box_idx == -1] = num_bbox - box_indices.scatter_add_(0, box_idx.long(), - box_idx.new_ones(box_idx.shape)) - box_indices = box_indices[:-1] - nonempty_box_mask = box_indices >= 0 - elif isinstance(bbox, DepthInstance3DBoxes): - box_indices = bbox.points_in_boxes(points) - nonempty_box_mask = box_indices.T.sum(1) >= 0 - else: - raise NotImplementedError('Unsupported bbox type!') - - bbox = bbox[nonempty_box_mask] - - if self.test_cfg.score_thr is not None: - score_thr = self.test_cfg.score_thr - keep = (obj_scores >= score_thr) - obj_scores = obj_scores[keep] - sem_scores = sem_scores[keep] - bbox = bbox.tensor[keep] - - if obj_scores.shape[0] > 0: - topk = min(nms_cfg.nms_pre, obj_scores.shape[0]) - obj_scores_nms, indices = torch.topk(obj_scores, k=topk) - bbox_for_nms = xywhr2xyxyr(bbox[indices].bev) - sem_scores_nms = sem_scores[indices] - - keep = nms_func(bbox_for_nms, obj_scores_nms, nms_cfg.iou_thr) - keep = keep[:nms_cfg.nms_post] - - bbox_selected = bbox.tensor[indices][keep] - score_selected = obj_scores_nms[keep] - cls_preds = sem_scores_nms[keep] - labels = torch.argmax(cls_preds, -1) - else: - bbox_selected = bbox.tensor - score_selected = obj_scores.new_zeros([0]) - labels = obj_scores.new_zeros([0]) - cls_preds = obj_scores.new_zeros([0, sem_scores.shape[-1]]) - - return bbox_selected, score_selected, labels, cls_preds - - def _assign_targets_by_points_inside(self, bboxes_3d, points): - """Compute assignment by checking whether point is inside bbox. - - Args: - bboxes_3d (:obj:`BaseInstance3DBoxes`): Instance of bounding boxes. - points (torch.Tensor): Points of a batch. - - Returns: - tuple[torch.Tensor]: Flags indicating whether each point is - inside bbox and the index of box where each point are in. - """ - # TODO: align points_in_boxes function in each box_structures - num_bbox = bboxes_3d.tensor.shape[0] - if isinstance(bboxes_3d, LiDARInstance3DBoxes): - assignment = bboxes_3d.points_in_boxes(points[:, 0:3]).long() - points_mask = assignment.new_zeros( - [assignment.shape[0], num_bbox + 1]) - assignment[assignment == -1] = num_bbox - points_mask.scatter_(1, assignment.unsqueeze(1), 1) - points_mask = points_mask[:, :-1] - assignment[assignment == num_bbox] = num_bbox - 1 - elif isinstance(bboxes_3d, DepthInstance3DBoxes): - points_mask = bboxes_3d.points_in_boxes(points) - assignment = points_mask.argmax(dim=-1) - else: - raise NotImplementedError('Unsupported bbox type!') - - return points_mask, assignment +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn + +from mmdet3d.core import xywhr2xyxyr +from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes, + LiDARInstance3DBoxes) +from mmdet3d.core.post_processing import nms_bev, nms_normal_bev +from mmdet.core import build_bbox_coder, multi_apply +from ..builder import HEADS, build_loss + + +@HEADS.register_module() +class PointRPNHead(BaseModule): + """RPN module for PointRCNN. + + Args: + num_classes (int): Number of classes. 
+ train_cfg (dict): Train configs. + test_cfg (dict): Test configs. + pred_layer_cfg (dict, optional): Config of classification and + regression prediction layers. Defaults to None. + enlarge_width (float, optional): Enlarge bbox for each side to ignore + close points. Defaults to 0.1. + cls_loss (dict, optional): Config of direction classification loss. + Defaults to None. + bbox_loss (dict, optional): Config of localization loss. + Defaults to None. + bbox_coder (dict, optional): Config dict of box coders. + Defaults to None. + init_cfg (dict, optional): Config of initialization. Defaults to None. + """ + + def __init__(self, + num_classes, + train_cfg, + test_cfg, + pred_layer_cfg=None, + enlarge_width=0.1, + cls_loss=None, + bbox_loss=None, + bbox_coder=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.enlarge_width = enlarge_width + + # build loss function + self.bbox_loss = build_loss(bbox_loss) + self.cls_loss = build_loss(cls_loss) + + # build box coder + self.bbox_coder = build_bbox_coder(bbox_coder) + + # build pred conv + self.cls_layers = self._make_fc_layers( + fc_cfg=pred_layer_cfg.cls_linear_channels, + input_channels=pred_layer_cfg.in_channels, + output_channels=self._get_cls_out_channels()) + + self.reg_layers = self._make_fc_layers( + fc_cfg=pred_layer_cfg.reg_linear_channels, + input_channels=pred_layer_cfg.in_channels, + output_channels=self._get_reg_out_channels()) + + def _make_fc_layers(self, fc_cfg, input_channels, output_channels): + """Make fully connect layers. + + Args: + fc_cfg (dict): Config of fully connect. + input_channels (int): Input channels for fc_layers. + output_channels (int): Input channels for fc_layers. + + Returns: + nn.Sequential: Fully connect layers. + """ + fc_layers = [] + c_in = input_channels + for k in range(0, fc_cfg.__len__()): + fc_layers.extend([ + nn.Linear(c_in, fc_cfg[k], bias=False), + nn.BatchNorm1d(fc_cfg[k]), + nn.ReLU(), + ]) + c_in = fc_cfg[k] + fc_layers.append(nn.Linear(c_in, output_channels, bias=True)) + return nn.Sequential(*fc_layers) + + def _get_cls_out_channels(self): + """Return the channel number of classification outputs.""" + # Class numbers (k) + objectness (1) + return self.num_classes + + def _get_reg_out_channels(self): + """Return the channel number of regression outputs.""" + # Bbox classification and regression + # (center residual (3), size regression (3) + # torch.cos(yaw) (1), torch.sin(yaw) (1) + return self.bbox_coder.code_size + + def forward(self, feat_dict): + """Forward pass. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + tuple[list[torch.Tensor]]: Predicted boxes and classification + scores. + """ + point_features = feat_dict['fp_features'] + point_features = point_features.permute(0, 2, 1).contiguous() + batch_size = point_features.shape[0] + feat_cls = point_features.view(-1, point_features.shape[-1]) + feat_reg = point_features.view(-1, point_features.shape[-1]) + + point_cls_preds = self.cls_layers(feat_cls).reshape( + batch_size, -1, self._get_cls_out_channels()) + point_box_preds = self.reg_layers(feat_reg).reshape( + batch_size, -1, self._get_reg_out_channels()) + return point_box_preds, point_cls_preds + + @force_fp32(apply_to=('bbox_preds')) + def loss(self, + bbox_preds, + cls_preds, + points, + gt_bboxes_3d, + gt_labels_3d, + img_metas=None): + """Compute loss. + + Args: + bbox_preds (dict): Predictions from forward of PointRCNN RPN_Head. 
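A minimal, self-contained sketch of the per-point prediction branch described above, reduced to plain PyTorch. The layer widths, channel count and class/code sizes below are illustrative placeholders rather than values from any shipped config; only the Linear/BatchNorm1d/ReLU stacking and the (B, C, N) -> (B, N, out) reshape mirror what the head does.

import torch
from torch import nn

def make_fc_layers(channels, in_channels, out_channels):
    # Stack of Linear + BatchNorm1d + ReLU blocks, mirroring the cls/reg
    # branches built in __init__ (widths here are illustrative).
    layers, c_in = [], in_channels
    for c_out in channels:
        layers += [nn.Linear(c_in, c_out, bias=False),
                   nn.BatchNorm1d(c_out), nn.ReLU()]
        c_in = c_out
    layers.append(nn.Linear(c_in, out_channels, bias=True))
    return nn.Sequential(*layers)

B, C, N = 2, 128, 1024          # batch, feature channels, points (assumed)
num_classes, code_size = 3, 8   # assumed sizes
fp_features = torch.rand(B, C, N)

cls_branch = make_fc_layers((256, 256), C, num_classes)
reg_branch = make_fc_layers((256, 256), C, code_size)

# (B, C, N) -> (B, N, C) -> (B*N, C), run the shared MLPs per point,
# then fold the batch dimension back, as in forward().
flat = fp_features.permute(0, 2, 1).contiguous().view(-1, C)
point_cls_preds = cls_branch(flat).reshape(B, -1, num_classes)
point_box_preds = reg_branch(flat).reshape(B, -1, code_size)
assert point_cls_preds.shape == (B, N, num_classes)
assert point_box_preds.shape == (B, N, code_size)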
+ cls_preds (dict): Classification from forward of PointRCNN + RPN_Head. + points (list[torch.Tensor]): Input points. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + img_metas (list[dict], Optional): Contain pcd and img's meta info. + Defaults to None. + + Returns: + dict: Losses of PointRCNN RPN module. + """ + targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d) + (bbox_targets, mask_targets, positive_mask, negative_mask, + box_loss_weights, point_targets) = targets + + # bbox loss + bbox_loss = self.bbox_loss(bbox_preds, bbox_targets, + box_loss_weights.unsqueeze(-1)) + # calculate semantic loss + semantic_points = cls_preds.reshape(-1, self.num_classes) + semantic_targets = mask_targets + semantic_targets[negative_mask] = self.num_classes + semantic_points_label = semantic_targets + # for ignore, but now we do not have ignored label + semantic_loss_weight = negative_mask.float() + positive_mask.float() + semantic_loss = self.cls_loss(semantic_points, + semantic_points_label.reshape(-1), + semantic_loss_weight.reshape(-1)) + semantic_loss /= positive_mask.float().sum() + losses = dict(bbox_loss=bbox_loss, semantic_loss=semantic_loss) + + return losses + + def get_targets(self, points, gt_bboxes_3d, gt_labels_3d): + """Generate targets of PointRCNN RPN head. + + Args: + points (list[torch.Tensor]): Points of each batch. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): Labels of each batch. + + Returns: + tuple[torch.Tensor]: Targets of PointRCNN RPN head. + """ + # find empty example + for index in range(len(gt_labels_3d)): + if len(gt_labels_3d[index]) == 0: + fake_box = gt_bboxes_3d[index].tensor.new_zeros( + 1, gt_bboxes_3d[index].tensor.shape[-1]) + gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) + gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) + + (bbox_targets, mask_targets, positive_mask, negative_mask, + point_targets) = multi_apply(self.get_targets_single, points, + gt_bboxes_3d, gt_labels_3d) + + bbox_targets = torch.stack(bbox_targets) + mask_targets = torch.stack(mask_targets) + positive_mask = torch.stack(positive_mask) + negative_mask = torch.stack(negative_mask) + box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6) + + return (bbox_targets, mask_targets, positive_mask, negative_mask, + box_loss_weights, point_targets) + + def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d): + """Generate targets of PointRCNN RPN head for single batch. + + Args: + points (torch.Tensor): Points of each batch. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth + boxes of each batch. + gt_labels_3d (torch.Tensor): Labels of each batch. + + Returns: + tuple[torch.Tensor]: Targets of ssd3d head. 
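A toy illustration of how the masks produced by get_targets feed the two loss terms, assuming a single scene with five points; all mask values are made up. Negative points are relabelled with the extra class index num_classes, and box regression is averaged over positives only.

import torch

num_classes = 3
positive_mask = torch.tensor([1., 0., 0., 1., 0.])  # point inside a GT box
negative_mask = torch.tensor([0., 1., 0., 0., 1.])  # outside every enlarged box
mask_targets  = torch.tensor([2, 0, 0, 1, 0])       # assigned GT class per point

# Box regression is averaged over positives only.
box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6)

# Negatives get the extra index `num_classes`; the semantic loss only
# counts clearly positive or clearly negative points.
semantic_targets = mask_targets.clone()
semantic_targets[negative_mask.bool()] = num_classes
semantic_loss_weight = positive_mask + negative_mask

print(box_loss_weights)      # tensor([0.5000, 0., 0., 0.5000, 0.])
print(semantic_targets)      # tensor([2, 3, 0, 1, 3])
print(semantic_loss_weight)  # tensor([1., 1., 0., 1., 1.])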
+ """ + gt_bboxes_3d = gt_bboxes_3d.to(points.device) + + valid_gt = gt_labels_3d != -1 + gt_bboxes_3d = gt_bboxes_3d[valid_gt] + gt_labels_3d = gt_labels_3d[valid_gt] + + # transform the bbox coordinate to the point cloud coordinate + gt_bboxes_3d_tensor = gt_bboxes_3d.tensor.clone() + gt_bboxes_3d_tensor[..., 2] += gt_bboxes_3d_tensor[..., 5] / 2 + + points_mask, assignment = self._assign_targets_by_points_inside( + gt_bboxes_3d, points) + gt_bboxes_3d_tensor = gt_bboxes_3d_tensor[assignment] + mask_targets = gt_labels_3d[assignment] + + bbox_targets = self.bbox_coder.encode(gt_bboxes_3d_tensor, + points[..., 0:3], mask_targets) + + positive_mask = (points_mask.max(1)[0] > 0) + # add ignore_mask + extend_gt_bboxes_3d = gt_bboxes_3d.enlarged_box(self.enlarge_width) + points_mask, _ = self._assign_targets_by_points_inside( + extend_gt_bboxes_3d, points) + negative_mask = (points_mask.max(1)[0] == 0) + + point_targets = points[..., 0:3] + return (bbox_targets, mask_targets, positive_mask, negative_mask, + point_targets) + + def get_bboxes(self, + points, + bbox_preds, + cls_preds, + input_metas, + rescale=False): + """Generate bboxes from RPN head predictions. + + Args: + points (torch.Tensor): Input points. + bbox_preds (dict): Regression predictions from PointRCNN head. + cls_preds (dict): Class scores predictions from PointRCNN head. + input_metas (list[dict]): Point cloud and image's meta info. + rescale (bool, optional): Whether to rescale bboxes. + Defaults to False. + + Returns: + list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. + """ + sem_scores = cls_preds.sigmoid() + obj_scores = sem_scores.max(-1)[0] + object_class = sem_scores.argmax(dim=-1) + + batch_size = sem_scores.shape[0] + results = list() + for b in range(batch_size): + bbox3d = self.bbox_coder.decode(bbox_preds[b], points[b, ..., :3], + object_class[b]) + bbox_selected, score_selected, labels, cls_preds_selected = \ + self.class_agnostic_nms(obj_scores[b], sem_scores[b], bbox3d, + points[b, ..., :3], input_metas[b]) + bbox = input_metas[b]['box_type_3d']( + bbox_selected.clone(), + box_dim=bbox_selected.shape[-1], + with_yaw=True) + results.append((bbox, score_selected, labels, cls_preds_selected)) + return results + + def class_agnostic_nms(self, obj_scores, sem_scores, bbox, points, + input_meta): + """Class agnostic nms. + + Args: + obj_scores (torch.Tensor): Objectness score of bounding boxes. + sem_scores (torch.Tensor): Semantic class score of bounding boxes. + bbox (torch.Tensor): Predicted bounding boxes. + + Returns: + tuple[torch.Tensor]: Bounding boxes, scores and labels. 
+ """ + nms_cfg = self.test_cfg.nms_cfg if not self.training \ + else self.train_cfg.nms_cfg + if nms_cfg.use_rotate_nms: + nms_func = nms_bev + else: + nms_func = nms_normal_bev + + num_bbox = bbox.shape[0] + bbox = input_meta['box_type_3d']( + bbox.clone(), + box_dim=bbox.shape[-1], + with_yaw=True, + origin=(0.5, 0.5, 0.5)) + + if isinstance(bbox, LiDARInstance3DBoxes): + box_idx = bbox.points_in_boxes(points) + box_indices = box_idx.new_zeros([num_bbox + 1]) + box_idx[box_idx == -1] = num_bbox + box_indices.scatter_add_(0, box_idx.long(), + box_idx.new_ones(box_idx.shape)) + box_indices = box_indices[:-1] + nonempty_box_mask = box_indices >= 0 + elif isinstance(bbox, DepthInstance3DBoxes): + box_indices = bbox.points_in_boxes(points) + nonempty_box_mask = box_indices.T.sum(1) >= 0 + else: + raise NotImplementedError('Unsupported bbox type!') + + bbox = bbox[nonempty_box_mask] + + if self.test_cfg.score_thr is not None: + score_thr = self.test_cfg.score_thr + keep = (obj_scores >= score_thr) + obj_scores = obj_scores[keep] + sem_scores = sem_scores[keep] + bbox = bbox.tensor[keep] + + if obj_scores.shape[0] > 0: + topk = min(nms_cfg.nms_pre, obj_scores.shape[0]) + obj_scores_nms, indices = torch.topk(obj_scores, k=topk) + bbox_for_nms = xywhr2xyxyr(bbox[indices].bev) + sem_scores_nms = sem_scores[indices] + + keep = nms_func(bbox_for_nms, obj_scores_nms, nms_cfg.iou_thr) + keep = keep[:nms_cfg.nms_post] + + bbox_selected = bbox.tensor[indices][keep] + score_selected = obj_scores_nms[keep] + cls_preds = sem_scores_nms[keep] + labels = torch.argmax(cls_preds, -1) + else: + bbox_selected = bbox.tensor + score_selected = obj_scores.new_zeros([0]) + labels = obj_scores.new_zeros([0]) + cls_preds = obj_scores.new_zeros([0, sem_scores.shape[-1]]) + + return bbox_selected, score_selected, labels, cls_preds + + def _assign_targets_by_points_inside(self, bboxes_3d, points): + """Compute assignment by checking whether point is inside bbox. + + Args: + bboxes_3d (:obj:`BaseInstance3DBoxes`): Instance of bounding boxes. + points (torch.Tensor): Points of a batch. + + Returns: + tuple[torch.Tensor]: Flags indicating whether each point is + inside bbox and the index of box where each point are in. + """ + # TODO: align points_in_boxes function in each box_structures + num_bbox = bboxes_3d.tensor.shape[0] + if isinstance(bboxes_3d, LiDARInstance3DBoxes): + assignment = bboxes_3d.points_in_boxes(points[:, 0:3]).long() + points_mask = assignment.new_zeros( + [assignment.shape[0], num_bbox + 1]) + assignment[assignment == -1] = num_bbox + points_mask.scatter_(1, assignment.unsqueeze(1), 1) + points_mask = points_mask[:, :-1] + assignment[assignment == num_bbox] = num_bbox - 1 + elif isinstance(bboxes_3d, DepthInstance3DBoxes): + points_mask = bboxes_3d.points_in_boxes(points) + assignment = points_mask.argmax(dim=-1) + else: + raise NotImplementedError('Unsupported bbox type!') + + return points_mask, assignment diff --git a/mmdet3d/models/dense_heads/shape_aware_head.py b/mmdet3d/models/dense_heads/shape_aware_head.py index 6c55571..fdda31d 100644 --- a/mmdet3d/models/dense_heads/shape_aware_head.py +++ b/mmdet3d/models/dense_heads/shape_aware_head.py @@ -1,515 +1,515 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import warnings - -import numpy as np -import torch -from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule -from torch import nn as nn - -from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr -from mmdet.core import multi_apply -from ..builder import HEADS, build_head -from .anchor3d_head import Anchor3DHead - - -@HEADS.register_module() -class BaseShapeHead(BaseModule): - """Base Shape-aware Head in Shape Signature Network. - - Note: - This base shape-aware grouping head uses default settings for small - objects. For large and huge objects, it is recommended to use - heavier heads, like (64, 64, 64) and (128, 128, 64, 64, 64) in - shared conv channels, (2, 1, 1) and (2, 1, 2, 1, 1) in shared - conv strides. For tiny objects, we can use smaller heads, like - (32, 32) channels and (1, 1) strides. - - Args: - num_cls (int): Number of classes. - num_base_anchors (int): Number of anchors per location. - box_code_size (int): The dimension of boxes to be encoded. - in_channels (int): Input channels for convolutional layers. - shared_conv_channels (tuple, optional): Channels for shared - convolutional layers. Default: (64, 64). - shared_conv_strides (tuple, optional): Strides for shared - convolutional layers. Default: (1, 1). - use_direction_classifier (bool, optional): Whether to use direction - classifier. Default: True. - conv_cfg (dict, optional): Config of conv layer. - Default: dict(type='Conv2d') - norm_cfg (dict, optional): Config of norm layer. - Default: dict(type='BN2d'). - bias (bool | str, optional): Type of bias. Default: False. - """ - - def __init__(self, - num_cls, - num_base_anchors, - box_code_size, - in_channels, - shared_conv_channels=(64, 64), - shared_conv_strides=(1, 1), - use_direction_classifier=True, - conv_cfg=dict(type='Conv2d'), - norm_cfg=dict(type='BN2d'), - bias=False, - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.num_cls = num_cls - self.num_base_anchors = num_base_anchors - self.use_direction_classifier = use_direction_classifier - self.box_code_size = box_code_size - - assert len(shared_conv_channels) == len(shared_conv_strides), \ - 'Lengths of channels and strides list should be equal.' 
- - self.shared_conv_channels = [in_channels] + list(shared_conv_channels) - self.shared_conv_strides = list(shared_conv_strides) - - shared_conv = [] - for i in range(len(self.shared_conv_strides)): - shared_conv.append( - ConvModule( - self.shared_conv_channels[i], - self.shared_conv_channels[i + 1], - kernel_size=3, - stride=self.shared_conv_strides[i], - padding=1, - conv_cfg=conv_cfg, - bias=bias, - norm_cfg=norm_cfg)) - - self.shared_conv = nn.Sequential(*shared_conv) - - out_channels = self.shared_conv_channels[-1] - self.conv_cls = nn.Conv2d(out_channels, num_base_anchors * num_cls, 1) - self.conv_reg = nn.Conv2d(out_channels, - num_base_anchors * box_code_size, 1) - - if use_direction_classifier: - self.conv_dir_cls = nn.Conv2d(out_channels, num_base_anchors * 2, - 1) - if init_cfg is None: - if use_direction_classifier: - self.init_cfg = dict( - type='Kaiming', - layer='Conv2d', - override=[ - dict(type='Normal', name='conv_reg', std=0.01), - dict( - type='Normal', - name='conv_cls', - std=0.01, - bias_prob=0.01), - dict( - type='Normal', - name='conv_dir_cls', - std=0.01, - bias_prob=0.01) - ]) - else: - self.init_cfg = dict( - type='Kaiming', - layer='Conv2d', - override=[ - dict(type='Normal', name='conv_reg', std=0.01), - dict( - type='Normal', - name='conv_cls', - std=0.01, - bias_prob=0.01) - ]) - - def forward(self, x): - """Forward function for SmallHead. - - Args: - x (torch.Tensor): Input feature map with the shape of - [B, C, H, W]. - - Returns: - dict[torch.Tensor]: Contain score of each class, bbox - regression and direction classification predictions. - Note that all the returned tensors are reshaped as - [bs*num_base_anchors*H*W, num_cls/box_code_size/dir_bins]. - It is more convenient to concat anchors for different - classes even though they have different feature map sizes. - """ - x = self.shared_conv(x) - cls_score = self.conv_cls(x) - bbox_pred = self.conv_reg(x) - featmap_size = bbox_pred.shape[-2:] - H, W = featmap_size - B = bbox_pred.shape[0] - cls_score = cls_score.view(-1, self.num_base_anchors, self.num_cls, H, - W).permute(0, 1, 3, 4, - 2).reshape(B, -1, self.num_cls) - bbox_pred = bbox_pred.view(-1, self.num_base_anchors, - self.box_code_size, H, W).permute( - 0, 1, 3, 4, - 2).reshape(B, -1, self.box_code_size) - - dir_cls_preds = None - if self.use_direction_classifier: - dir_cls_preds = self.conv_dir_cls(x) - dir_cls_preds = dir_cls_preds.view(-1, self.num_base_anchors, 2, H, - W).permute(0, 1, 3, 4, - 2).reshape(B, -1, 2) - ret = dict( - cls_score=cls_score, - bbox_pred=bbox_pred, - dir_cls_preds=dir_cls_preds, - featmap_size=featmap_size) - return ret - - -@HEADS.register_module() -class ShapeAwareHead(Anchor3DHead): - """Shape-aware grouping head for SSN. - - Args: - tasks (dict): Shape-aware groups of multi-class objects. - assign_per_class (bool, optional): Whether to do assignment for each - class. Default: True. - kwargs (dict): Other arguments are the same as those in - :class:`Anchor3DHead`. 
- """ - - def __init__(self, tasks, assign_per_class=True, init_cfg=None, **kwargs): - self.tasks = tasks - self.featmap_sizes = [] - super().__init__( - assign_per_class=assign_per_class, init_cfg=init_cfg, **kwargs) - - def init_weights(self): - if not self._is_init: - for m in self.heads: - if hasattr(m, 'init_weights'): - m.init_weights() - self._is_init = True - else: - warnings.warn(f'init_weights of {self.__class__.__name__} has ' - f'been called more than once.') - - def _init_layers(self): - """Initialize neural network layers of the head.""" - self.heads = nn.ModuleList() - cls_ptr = 0 - for task in self.tasks: - sizes = self.anchor_generator.sizes[cls_ptr:cls_ptr + - task['num_class']] - num_size = torch.tensor(sizes).reshape(-1, 3).size(0) - num_rot = len(self.anchor_generator.rotations) - num_base_anchors = num_rot * num_size - branch = dict( - type='BaseShapeHead', - num_cls=self.num_classes, - num_base_anchors=num_base_anchors, - box_code_size=self.box_code_size, - in_channels=self.in_channels, - shared_conv_channels=task['shared_conv_channels'], - shared_conv_strides=task['shared_conv_strides']) - self.heads.append(build_head(branch)) - cls_ptr += task['num_class'] - - def forward_single(self, x): - """Forward function on a single-scale feature map. - - Args: - x (torch.Tensor): Input features. - Returns: - tuple[torch.Tensor]: Contain score of each class, bbox - regression and direction classification predictions. - """ - results = [] - - for head in self.heads: - results.append(head(x)) - - cls_score = torch.cat([result['cls_score'] for result in results], - dim=1) - bbox_pred = torch.cat([result['bbox_pred'] for result in results], - dim=1) - dir_cls_preds = None - if self.use_direction_classifier: - dir_cls_preds = torch.cat( - [result['dir_cls_preds'] for result in results], dim=1) - - self.featmap_sizes = [] - for i, task in enumerate(self.tasks): - for _ in range(task['num_class']): - self.featmap_sizes.append(results[i]['featmap_size']) - assert len(self.featmap_sizes) == len(self.anchor_generator.ranges), \ - 'Length of feature map sizes must be equal to length of ' + \ - 'different ranges of anchor generator.' - - return cls_score, bbox_pred, dir_cls_preds - - def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, - label_weights, bbox_targets, bbox_weights, dir_targets, - dir_weights, num_total_samples): - """Calculate loss of Single-level results. - - Args: - cls_score (torch.Tensor): Class score in single-level. - bbox_pred (torch.Tensor): Bbox prediction in single-level. - dir_cls_preds (torch.Tensor): Predictions of direction class - in single-level. - labels (torch.Tensor): Labels of class. - label_weights (torch.Tensor): Weights of class loss. - bbox_targets (torch.Tensor): Targets of bbox predictions. - bbox_weights (torch.Tensor): Weights of bbox loss. - dir_targets (torch.Tensor): Targets of direction predictions. - dir_weights (torch.Tensor): Weights of direction loss. - num_total_samples (int): The number of valid samples. - - Returns: - tuple[torch.Tensor]: Losses of class, bbox - and direction, respectively. 
- """ - # classification loss - if num_total_samples is None: - num_total_samples = int(cls_score.shape[0]) - labels = labels.reshape(-1) - label_weights = label_weights.reshape(-1) - cls_score = cls_score.reshape(-1, self.num_classes) - loss_cls = self.loss_cls( - cls_score, labels, label_weights, avg_factor=num_total_samples) - - # regression loss - bbox_targets = bbox_targets.reshape(-1, self.box_code_size) - bbox_weights = bbox_weights.reshape(-1, self.box_code_size) - code_weight = self.train_cfg.get('code_weight', None) - - if code_weight: - bbox_weights = bbox_weights * bbox_weights.new_tensor(code_weight) - bbox_pred = bbox_pred.reshape(-1, self.box_code_size) - if self.diff_rad_by_sin: - bbox_pred, bbox_targets = self.add_sin_difference( - bbox_pred, bbox_targets) - loss_bbox = self.loss_bbox( - bbox_pred, - bbox_targets, - bbox_weights, - avg_factor=num_total_samples) - - # direction classification loss - loss_dir = None - if self.use_direction_classifier: - dir_cls_preds = dir_cls_preds.reshape(-1, 2) - dir_targets = dir_targets.reshape(-1) - dir_weights = dir_weights.reshape(-1) - loss_dir = self.loss_dir( - dir_cls_preds, - dir_targets, - dir_weights, - avg_factor=num_total_samples) - - return loss_cls, loss_bbox, loss_dir - - def loss(self, - cls_scores, - bbox_preds, - dir_cls_preds, - gt_bboxes, - gt_labels, - input_metas, - gt_bboxes_ignore=None): - """Calculate losses. - - Args: - cls_scores (list[torch.Tensor]): Multi-level class scores. - bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. - dir_cls_preds (list[torch.Tensor]): Multi-level direction - class predictions. - gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes - of each sample. - gt_labels (list[torch.Tensor]): Gt labels of each sample. - input_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - - Returns: - dict[str, list[torch.Tensor]]: Classification, bbox, and - direction losses of each level. - - - loss_cls (list[torch.Tensor]): Classification losses. - - loss_bbox (list[torch.Tensor]): Box regression losses. - - loss_dir (list[torch.Tensor]): Direction classification - losses. - """ - device = cls_scores[0].device - anchor_list = self.get_anchors( - self.featmap_sizes, input_metas, device=device) - cls_reg_targets = self.anchor_target_3d( - anchor_list, - gt_bboxes, - input_metas, - gt_bboxes_ignore_list=gt_bboxes_ignore, - gt_labels_list=gt_labels, - num_classes=self.num_classes, - sampling=self.sampling) - - if cls_reg_targets is None: - return None - (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, - dir_targets_list, dir_weights_list, num_total_pos, - num_total_neg) = cls_reg_targets - num_total_samples = ( - num_total_pos + num_total_neg if self.sampling else num_total_pos) - - # num_total_samples = None - losses_cls, losses_bbox, losses_dir = multi_apply( - self.loss_single, - cls_scores, - bbox_preds, - dir_cls_preds, - labels_list, - label_weights_list, - bbox_targets_list, - bbox_weights_list, - dir_targets_list, - dir_weights_list, - num_total_samples=num_total_samples) - return dict( - loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir) - - def get_bboxes(self, - cls_scores, - bbox_preds, - dir_cls_preds, - input_metas, - cfg=None, - rescale=False): - """Get bboxes of anchor head. - - Args: - cls_scores (list[torch.Tensor]): Multi-level class scores. - bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. 
- dir_cls_preds (list[torch.Tensor]): Multi-level direction - class predictions. - input_metas (list[dict]): Contain pcd and img's meta info. - cfg (:obj:`ConfigDict`, optional): Training or testing config. - Default: None. - rescale (list[torch.Tensor], optional): Whether to rescale bbox. - Default: False. - - Returns: - list[tuple]: Prediction resultes of batches. - """ - assert len(cls_scores) == len(bbox_preds) - assert len(cls_scores) == len(dir_cls_preds) - num_levels = len(cls_scores) - assert num_levels == 1, 'Only support single level inference.' - device = cls_scores[0].device - mlvl_anchors = self.anchor_generator.grid_anchors( - self.featmap_sizes, device=device) - # `anchor` is a list of anchors for different classes - mlvl_anchors = [torch.cat(anchor, dim=0) for anchor in mlvl_anchors] - - result_list = [] - for img_id in range(len(input_metas)): - cls_score_list = [ - cls_scores[i][img_id].detach() for i in range(num_levels) - ] - bbox_pred_list = [ - bbox_preds[i][img_id].detach() for i in range(num_levels) - ] - dir_cls_pred_list = [ - dir_cls_preds[i][img_id].detach() for i in range(num_levels) - ] - - input_meta = input_metas[img_id] - proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, - dir_cls_pred_list, mlvl_anchors, - input_meta, cfg, rescale) - result_list.append(proposals) - return result_list - - def get_bboxes_single(self, - cls_scores, - bbox_preds, - dir_cls_preds, - mlvl_anchors, - input_meta, - cfg=None, - rescale=False): - """Get bboxes of single branch. - - Args: - cls_scores (torch.Tensor): Class score in single batch. - bbox_preds (torch.Tensor): Bbox prediction in single batch. - dir_cls_preds (torch.Tensor): Predictions of direction class - in single batch. - mlvl_anchors (List[torch.Tensor]): Multi-level anchors - in single batch. - input_meta (list[dict]): Contain pcd and img's meta info. - cfg (:obj:`ConfigDict`): Training or testing config. - rescale (list[torch.Tensor], optional): whether to rescale bbox. - Default: False. - - Returns: - tuple: Contain predictions of single batch. - - - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - - scores (torch.Tensor): Class score of each bbox. - - labels (torch.Tensor): Label of each bbox. 
- """ - cfg = self.test_cfg if cfg is None else cfg - assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) - mlvl_bboxes = [] - mlvl_scores = [] - mlvl_dir_scores = [] - for cls_score, bbox_pred, dir_cls_pred, anchors in zip( - cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): - assert cls_score.size()[-2] == bbox_pred.size()[-2] - assert cls_score.size()[-2] == dir_cls_pred.size()[-2] - dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] - - if self.use_sigmoid_cls: - scores = cls_score.sigmoid() - else: - scores = cls_score.softmax(-1) - - nms_pre = cfg.get('nms_pre', -1) - if nms_pre > 0 and scores.shape[0] > nms_pre: - if self.use_sigmoid_cls: - max_scores, _ = scores.max(dim=1) - else: - max_scores, _ = scores[:, :-1].max(dim=1) - _, topk_inds = max_scores.topk(nms_pre) - anchors = anchors[topk_inds, :] - bbox_pred = bbox_pred[topk_inds, :] - scores = scores[topk_inds, :] - dir_cls_score = dir_cls_score[topk_inds] - - bboxes = self.bbox_coder.decode(anchors, bbox_pred) - mlvl_bboxes.append(bboxes) - mlvl_scores.append(scores) - mlvl_dir_scores.append(dir_cls_score) - - mlvl_bboxes = torch.cat(mlvl_bboxes) - mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( - mlvl_bboxes, box_dim=self.box_code_size).bev) - mlvl_scores = torch.cat(mlvl_scores) - mlvl_dir_scores = torch.cat(mlvl_dir_scores) - - if self.use_sigmoid_cls: - # Add a dummy background class to the front when using sigmoid - padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) - mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) - - score_thr = cfg.get('score_thr', 0) - results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, - mlvl_scores, score_thr, cfg.max_num, - cfg, mlvl_dir_scores) - bboxes, scores, labels, dir_scores = results - if bboxes.shape[0] > 0: - dir_rot = limit_period(bboxes[..., 6] - self.dir_offset, - self.dir_limit_offset, np.pi) - bboxes[..., 6] = ( - dir_rot + self.dir_offset + - np.pi * dir_scores.to(bboxes.dtype)) - bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size) - return bboxes, scores, labels +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule +from torch import nn as nn + +from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr +from mmdet.core import multi_apply +from ..builder import HEADS, build_head +from .anchor3d_head import Anchor3DHead + + +@HEADS.register_module() +class BaseShapeHead(BaseModule): + """Base Shape-aware Head in Shape Signature Network. + + Note: + This base shape-aware grouping head uses default settings for small + objects. For large and huge objects, it is recommended to use + heavier heads, like (64, 64, 64) and (128, 128, 64, 64, 64) in + shared conv channels, (2, 1, 1) and (2, 1, 2, 1, 1) in shared + conv strides. For tiny objects, we can use smaller heads, like + (32, 32) channels and (1, 1) strides. + + Args: + num_cls (int): Number of classes. + num_base_anchors (int): Number of anchors per location. + box_code_size (int): The dimension of boxes to be encoded. + in_channels (int): Input channels for convolutional layers. + shared_conv_channels (tuple, optional): Channels for shared + convolutional layers. Default: (64, 64). + shared_conv_strides (tuple, optional): Strides for shared + convolutional layers. Default: (1, 1). + use_direction_classifier (bool, optional): Whether to use direction + classifier. Default: True. + conv_cfg (dict, optional): Config of conv layer. 
+ Default: dict(type='Conv2d') + norm_cfg (dict, optional): Config of norm layer. + Default: dict(type='BN2d'). + bias (bool | str, optional): Type of bias. Default: False. + """ + + def __init__(self, + num_cls, + num_base_anchors, + box_code_size, + in_channels, + shared_conv_channels=(64, 64), + shared_conv_strides=(1, 1), + use_direction_classifier=True, + conv_cfg=dict(type='Conv2d'), + norm_cfg=dict(type='BN2d'), + bias=False, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.num_cls = num_cls + self.num_base_anchors = num_base_anchors + self.use_direction_classifier = use_direction_classifier + self.box_code_size = box_code_size + + assert len(shared_conv_channels) == len(shared_conv_strides), \ + 'Lengths of channels and strides list should be equal.' + + self.shared_conv_channels = [in_channels] + list(shared_conv_channels) + self.shared_conv_strides = list(shared_conv_strides) + + shared_conv = [] + for i in range(len(self.shared_conv_strides)): + shared_conv.append( + ConvModule( + self.shared_conv_channels[i], + self.shared_conv_channels[i + 1], + kernel_size=3, + stride=self.shared_conv_strides[i], + padding=1, + conv_cfg=conv_cfg, + bias=bias, + norm_cfg=norm_cfg)) + + self.shared_conv = nn.Sequential(*shared_conv) + + out_channels = self.shared_conv_channels[-1] + self.conv_cls = nn.Conv2d(out_channels, num_base_anchors * num_cls, 1) + self.conv_reg = nn.Conv2d(out_channels, + num_base_anchors * box_code_size, 1) + + if use_direction_classifier: + self.conv_dir_cls = nn.Conv2d(out_channels, num_base_anchors * 2, + 1) + if init_cfg is None: + if use_direction_classifier: + self.init_cfg = dict( + type='Kaiming', + layer='Conv2d', + override=[ + dict(type='Normal', name='conv_reg', std=0.01), + dict( + type='Normal', + name='conv_cls', + std=0.01, + bias_prob=0.01), + dict( + type='Normal', + name='conv_dir_cls', + std=0.01, + bias_prob=0.01) + ]) + else: + self.init_cfg = dict( + type='Kaiming', + layer='Conv2d', + override=[ + dict(type='Normal', name='conv_reg', std=0.01), + dict( + type='Normal', + name='conv_cls', + std=0.01, + bias_prob=0.01) + ]) + + def forward(self, x): + """Forward function for SmallHead. + + Args: + x (torch.Tensor): Input feature map with the shape of + [B, C, H, W]. + + Returns: + dict[torch.Tensor]: Contain score of each class, bbox + regression and direction classification predictions. + Note that all the returned tensors are reshaped as + [bs*num_base_anchors*H*W, num_cls/box_code_size/dir_bins]. + It is more convenient to concat anchors for different + classes even though they have different feature map sizes. + """ + x = self.shared_conv(x) + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + featmap_size = bbox_pred.shape[-2:] + H, W = featmap_size + B = bbox_pred.shape[0] + cls_score = cls_score.view(-1, self.num_base_anchors, self.num_cls, H, + W).permute(0, 1, 3, 4, + 2).reshape(B, -1, self.num_cls) + bbox_pred = bbox_pred.view(-1, self.num_base_anchors, + self.box_code_size, H, W).permute( + 0, 1, 3, 4, + 2).reshape(B, -1, self.box_code_size) + + dir_cls_preds = None + if self.use_direction_classifier: + dir_cls_preds = self.conv_dir_cls(x) + dir_cls_preds = dir_cls_preds.view(-1, self.num_base_anchors, 2, H, + W).permute(0, 1, 3, 4, + 2).reshape(B, -1, 2) + ret = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + dir_cls_preds=dir_cls_preds, + featmap_size=featmap_size) + return ret + + +@HEADS.register_module() +class ShapeAwareHead(Anchor3DHead): + """Shape-aware grouping head for SSN. 
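A pure-PyTorch sketch of the reshape performed in BaseShapeHead.forward, with illustrative sizes; it shows why outputs from heads with different feature-map sizes can simply be concatenated along dimension 1 afterwards.

import torch

B, H, W = 2, 4, 8
num_cls, num_base_anchors, box_code_size = 3, 4, 7  # illustrative sizes

# Raw conv outputs: (B, A*num_cls, H, W) and (B, A*code_size, H, W).
cls_score = torch.rand(B, num_base_anchors * num_cls, H, W)
bbox_pred = torch.rand(B, num_base_anchors * box_code_size, H, W)

# Split the channel axis into (anchor, channel), move the spatial axes in
# front of the channel axis, then flatten to (B, A*H*W, channel).
cls_score = (cls_score.view(B, num_base_anchors, num_cls, H, W)
             .permute(0, 1, 3, 4, 2).reshape(B, -1, num_cls))
bbox_pred = (bbox_pred.view(B, num_base_anchors, box_code_size, H, W)
             .permute(0, 1, 3, 4, 2).reshape(B, -1, box_code_size))
assert cls_score.shape == (B, num_base_anchors * H * W, num_cls)
assert bbox_pred.shape == (B, num_base_anchors * H * W, box_code_size)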
+ + Args: + tasks (dict): Shape-aware groups of multi-class objects. + assign_per_class (bool, optional): Whether to do assignment for each + class. Default: True. + kwargs (dict): Other arguments are the same as those in + :class:`Anchor3DHead`. + """ + + def __init__(self, tasks, assign_per_class=True, init_cfg=None, **kwargs): + self.tasks = tasks + self.featmap_sizes = [] + super().__init__( + assign_per_class=assign_per_class, init_cfg=init_cfg, **kwargs) + + def init_weights(self): + if not self._is_init: + for m in self.heads: + if hasattr(m, 'init_weights'): + m.init_weights() + self._is_init = True + else: + warnings.warn(f'init_weights of {self.__class__.__name__} has ' + f'been called more than once.') + + def _init_layers(self): + """Initialize neural network layers of the head.""" + self.heads = nn.ModuleList() + cls_ptr = 0 + for task in self.tasks: + sizes = self.anchor_generator.sizes[cls_ptr:cls_ptr + + task['num_class']] + num_size = torch.tensor(sizes).reshape(-1, 3).size(0) + num_rot = len(self.anchor_generator.rotations) + num_base_anchors = num_rot * num_size + branch = dict( + type='BaseShapeHead', + num_cls=self.num_classes, + num_base_anchors=num_base_anchors, + box_code_size=self.box_code_size, + in_channels=self.in_channels, + shared_conv_channels=task['shared_conv_channels'], + shared_conv_strides=task['shared_conv_strides']) + self.heads.append(build_head(branch)) + cls_ptr += task['num_class'] + + def forward_single(self, x): + """Forward function on a single-scale feature map. + + Args: + x (torch.Tensor): Input features. + Returns: + tuple[torch.Tensor]: Contain score of each class, bbox + regression and direction classification predictions. + """ + results = [] + + for head in self.heads: + results.append(head(x)) + + cls_score = torch.cat([result['cls_score'] for result in results], + dim=1) + bbox_pred = torch.cat([result['bbox_pred'] for result in results], + dim=1) + dir_cls_preds = None + if self.use_direction_classifier: + dir_cls_preds = torch.cat( + [result['dir_cls_preds'] for result in results], dim=1) + + self.featmap_sizes = [] + for i, task in enumerate(self.tasks): + for _ in range(task['num_class']): + self.featmap_sizes.append(results[i]['featmap_size']) + assert len(self.featmap_sizes) == len(self.anchor_generator.ranges), \ + 'Length of feature map sizes must be equal to length of ' + \ + 'different ranges of anchor generator.' + + return cls_score, bbox_pred, dir_cls_preds + + def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, + label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, num_total_samples): + """Calculate loss of Single-level results. + + Args: + cls_score (torch.Tensor): Class score in single-level. + bbox_pred (torch.Tensor): Bbox prediction in single-level. + dir_cls_preds (torch.Tensor): Predictions of direction class + in single-level. + labels (torch.Tensor): Labels of class. + label_weights (torch.Tensor): Weights of class loss. + bbox_targets (torch.Tensor): Targets of bbox predictions. + bbox_weights (torch.Tensor): Weights of bbox loss. + dir_targets (torch.Tensor): Targets of direction predictions. + dir_weights (torch.Tensor): Weights of direction loss. + num_total_samples (int): The number of valid samples. + + Returns: + tuple[torch.Tensor]: Losses of class, bbox + and direction, respectively. 
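A sketch of the bookkeeping in _init_layers that splits anchor sizes into shape-aware groups; the sizes, rotations and task dicts below are invented in the spirit of an SSN config and are not taken from any real config file.

import torch

sizes = [[0.8, 0.6, 1.7],    # pedestrian-like (illustrative)
         [1.8, 0.6, 1.7],    # cyclist-like
         [4.6, 1.9, 1.6],    # car-like
         [11.0, 2.9, 3.5]]   # truck-like
rotations = [0.0, 1.57]
tasks = [dict(num_class=2, shared_conv_channels=(64, 64),
              shared_conv_strides=(1, 1)),
         dict(num_class=2, shared_conv_channels=(64, 64, 64),
              shared_conv_strides=(2, 1, 1))]

cls_ptr = 0
for task in tasks:
    group_sizes = sizes[cls_ptr:cls_ptr + task['num_class']]
    num_size = torch.tensor(group_sizes).reshape(-1, 3).size(0)
    num_base_anchors = len(rotations) * num_size
    print(task['shared_conv_channels'], '->', num_base_anchors, 'anchors/loc')
    cls_ptr += task['num_class']
# (64, 64) -> 4 anchors/loc
# (64, 64, 64) -> 4 anchors/loc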
+ """ + # classification loss + if num_total_samples is None: + num_total_samples = int(cls_score.shape[0]) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.reshape(-1, self.num_classes) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + + # regression loss + bbox_targets = bbox_targets.reshape(-1, self.box_code_size) + bbox_weights = bbox_weights.reshape(-1, self.box_code_size) + code_weight = self.train_cfg.get('code_weight', None) + + if code_weight: + bbox_weights = bbox_weights * bbox_weights.new_tensor(code_weight) + bbox_pred = bbox_pred.reshape(-1, self.box_code_size) + if self.diff_rad_by_sin: + bbox_pred, bbox_targets = self.add_sin_difference( + bbox_pred, bbox_targets) + loss_bbox = self.loss_bbox( + bbox_pred, + bbox_targets, + bbox_weights, + avg_factor=num_total_samples) + + # direction classification loss + loss_dir = None + if self.use_direction_classifier: + dir_cls_preds = dir_cls_preds.reshape(-1, 2) + dir_targets = dir_targets.reshape(-1) + dir_weights = dir_weights.reshape(-1) + loss_dir = self.loss_dir( + dir_cls_preds, + dir_targets, + dir_weights, + avg_factor=num_total_samples) + + return loss_cls, loss_bbox, loss_dir + + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + gt_bboxes, + gt_labels, + input_metas, + gt_bboxes_ignore=None): + """Calculate losses. + + Args: + cls_scores (list[torch.Tensor]): Multi-level class scores. + bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. + dir_cls_preds (list[torch.Tensor]): Multi-level direction + class predictions. + gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes + of each sample. + gt_labels (list[torch.Tensor]): Gt labels of each sample. + input_metas (list[dict]): Contain pcd and img's meta info. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + + Returns: + dict[str, list[torch.Tensor]]: Classification, bbox, and + direction losses of each level. + + - loss_cls (list[torch.Tensor]): Classification losses. + - loss_bbox (list[torch.Tensor]): Box regression losses. + - loss_dir (list[torch.Tensor]): Direction classification + losses. + """ + device = cls_scores[0].device + anchor_list = self.get_anchors( + self.featmap_sizes, input_metas, device=device) + cls_reg_targets = self.anchor_target_3d( + anchor_list, + gt_bboxes, + input_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + num_classes=self.num_classes, + sampling=self.sampling) + + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + dir_targets_list, dir_weights_list, num_total_pos, + num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # num_total_samples = None + losses_cls, losses_bbox, losses_dir = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + dir_cls_preds, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + dir_targets_list, + dir_weights_list, + num_total_samples=num_total_samples) + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir) + + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + input_metas, + cfg=None, + rescale=False): + """Get bboxes of anchor head. + + Args: + cls_scores (list[torch.Tensor]): Multi-level class scores. + bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. 
+ dir_cls_preds (list[torch.Tensor]): Multi-level direction + class predictions. + input_metas (list[dict]): Contain pcd and img's meta info. + cfg (:obj:`ConfigDict`, optional): Training or testing config. + Default: None. + rescale (list[torch.Tensor], optional): Whether to rescale bbox. + Default: False. + + Returns: + list[tuple]: Prediction resultes of batches. + """ + assert len(cls_scores) == len(bbox_preds) + assert len(cls_scores) == len(dir_cls_preds) + num_levels = len(cls_scores) + assert num_levels == 1, 'Only support single level inference.' + device = cls_scores[0].device + mlvl_anchors = self.anchor_generator.grid_anchors( + self.featmap_sizes, device=device) + # `anchor` is a list of anchors for different classes + mlvl_anchors = [torch.cat(anchor, dim=0) for anchor in mlvl_anchors] + + result_list = [] + for img_id in range(len(input_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() for i in range(num_levels) + ] + + input_meta = input_metas[img_id] + proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, + dir_cls_pred_list, mlvl_anchors, + input_meta, cfg, rescale) + result_list.append(proposals) + return result_list + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + mlvl_anchors, + input_meta, + cfg=None, + rescale=False): + """Get bboxes of single branch. + + Args: + cls_scores (torch.Tensor): Class score in single batch. + bbox_preds (torch.Tensor): Bbox prediction in single batch. + dir_cls_preds (torch.Tensor): Predictions of direction class + in single batch. + mlvl_anchors (List[torch.Tensor]): Multi-level anchors + in single batch. + input_meta (list[dict]): Contain pcd and img's meta info. + cfg (:obj:`ConfigDict`): Training or testing config. + rescale (list[torch.Tensor], optional): whether to rescale bbox. + Default: False. + + Returns: + tuple: Contain predictions of single batch. + + - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. + - scores (torch.Tensor): Class score of each bbox. + - labels (torch.Tensor): Label of each bbox. 
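A sketch of the yaw recovery at the end of get_bboxes_single; limit_period is re-implemented locally for illustration using its usual definition, and dir_offset / dir_limit_offset are placeholder values rather than the head's configured ones.

import math
import torch

def limit_period(val, offset=0.5, period=2 * math.pi):
    # Shift `val` by whole periods so that it falls into
    # [-offset * period, (1 - offset) * period).
    return val - torch.floor(val / period + offset) * period

dir_offset, dir_limit_offset = 0.7854, 0.0   # illustrative values only
yaw = torch.tensor([2.9, -2.9])              # decoded yaw before the fix
dir_scores = torch.tensor([0, 1])            # predicted direction bin per box

# Fold the yaw into a half-turn window around dir_offset, then add the
# predicted half-turn back, mirroring the tail of get_bboxes_single.
dir_rot = limit_period(yaw - dir_offset, dir_limit_offset, math.pi)
fixed_yaw = dir_rot + dir_offset + math.pi * dir_scores.to(yaw.dtype)
print(fixed_yaw)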
+ """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + for cls_score, bbox_pred, dir_cls_pred, anchors in zip( + cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): + assert cls_score.size()[-2] == bbox_pred.size()[-2] + assert cls_score.size()[-2] == dir_cls_pred.size()[-2] + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_score = dir_cls_score[topk_inds] + + bboxes = self.bbox_coder.decode(anchors, bbox_pred) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + mlvl_bboxes, box_dim=self.box_code_size).bev) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + if self.use_sigmoid_cls: + # Add a dummy background class to the front when using sigmoid + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + + score_thr = cfg.get('score_thr', 0) + results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_scores, score_thr, cfg.max_num, + cfg, mlvl_dir_scores) + bboxes, scores, labels, dir_scores = results + if bboxes.shape[0] > 0: + dir_rot = limit_period(bboxes[..., 6] - self.dir_offset, + self.dir_limit_offset, np.pi) + bboxes[..., 6] = ( + dir_rot + self.dir_offset + + np.pi * dir_scores.to(bboxes.dtype)) + bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size) + return bboxes, scores, labels diff --git a/mmdet3d/models/dense_heads/smoke_mono3d_head.py b/mmdet3d/models/dense_heads/smoke_mono3d_head.py index 3459e09..ca14a43 100644 --- a/mmdet3d/models/dense_heads/smoke_mono3d_head.py +++ b/mmdet3d/models/dense_heads/smoke_mono3d_head.py @@ -1,516 +1,516 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch.nn import functional as F - -from mmdet.core import multi_apply -from mmdet.core.bbox.builder import build_bbox_coder -from mmdet.models.utils import gaussian_radius, gen_gaussian_target -from mmdet.models.utils.gaussian_target import (get_local_maximum, - get_topk_from_heatmap, - transpose_and_gather_feat) -from ..builder import HEADS -from .anchor_free_mono3d_head import AnchorFreeMono3DHead - - -@HEADS.register_module() -class SMOKEMono3DHead(AnchorFreeMono3DHead): - r"""Anchor-free head used in `SMOKE `_ - - .. code-block:: none - - /-----> 3*3 conv -----> 1*1 conv -----> cls - feature - \-----> 3*3 conv -----> 1*1 conv -----> reg - - Args: - num_classes (int): Number of categories excluding the background - category. - in_channels (int): Number of channels in the input feature map. - dim_channel (list[int]): indices of dimension offset preds in - regression heatmap channels. - ori_channel (list[int]): indices of orientation offset pred in - regression heatmap channels. - bbox_coder (:obj:`CameraInstance3DBoxes`): Bbox coder - for encoding and decoding boxes. 
- loss_cls (dict, optional): Config of classification loss. - Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0). - loss_bbox (dict, optional): Config of localization loss. - Default: loss_bbox=dict(type='L1Loss', loss_weight=10.0). - loss_dir (dict, optional): Config of direction classification loss. - In SMOKE, Default: None. - loss_attr (dict, optional): Config of attribute classification loss. - In SMOKE, Default: None. - loss_centerness (dict): Config of centerness loss. - norm_cfg (dict): Dictionary to construct and config norm layer. - Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). - init_cfg (dict): Initialization config dict. Default: None. - """ # noqa: E501 - - def __init__(self, - num_classes, - in_channels, - dim_channel, - ori_channel, - bbox_coder, - loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0), - loss_bbox=dict(type='L1Loss', loss_weight=0.1), - loss_dir=None, - loss_attr=None, - norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), - init_cfg=None, - **kwargs): - super().__init__( - num_classes, - in_channels, - loss_cls=loss_cls, - loss_bbox=loss_bbox, - loss_dir=loss_dir, - loss_attr=loss_attr, - norm_cfg=norm_cfg, - init_cfg=init_cfg, - **kwargs) - self.dim_channel = dim_channel - self.ori_channel = ori_channel - self.bbox_coder = build_bbox_coder(bbox_coder) - - def forward(self, feats): - """Forward features from the upstream network. - - Args: - feats (tuple[Tensor]): Features from the upstream network, each is - a 4D-tensor. - - Returns: - tuple: - cls_scores (list[Tensor]): Box scores for each scale level, - each is a 4D-tensor, the channel number is - num_points * num_classes. - bbox_preds (list[Tensor]): Box energies / deltas for each scale - level, each is a 4D-tensor, the channel number is - num_points * bbox_code_size. - """ - return multi_apply(self.forward_single, feats) - - def forward_single(self, x): - """Forward features of a single scale level. - - Args: - x (Tensor): Input feature map. - - Returns: - tuple: Scores for each class, bbox of input feature maps. - """ - cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ - super().forward_single(x) - cls_score = cls_score.sigmoid() # turn to 0-1 - cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4) - # (N, C, H, W) - offset_dims = bbox_pred[:, self.dim_channel, ...] - bbox_pred[:, self.dim_channel, ...] = offset_dims.sigmoid() - 0.5 - # (N, C, H, W) - vector_ori = bbox_pred[:, self.ori_channel, ...] - bbox_pred[:, self.ori_channel, ...] = F.normalize(vector_ori) - return cls_score, bbox_pred - - def get_bboxes(self, cls_scores, bbox_preds, img_metas, rescale=None): - """Generate bboxes from bbox head predictions. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level. - bbox_preds (list[Tensor]): Box regression for each scale. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - rescale (bool): If True, return boxes in original image space. - - Returns: - list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]: - Each item in result_list is 4-tuple. 
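A pure-PyTorch sketch of the post-processing SMOKEMono3DHead.forward_single applies to the raw branch outputs; the channel indices in dim_channel and ori_channel below are illustrative, not taken from a real SMOKE config.

import torch
from torch.nn import functional as F

B, H, W = 2, 96, 320
num_classes, reg_channels = 3, 8
dim_channel = [3, 4, 5]   # assumed indices of the dimension offsets
ori_channel = [6, 7]      # assumed indices of the orientation vector

cls_score = torch.randn(B, num_classes, H, W)
bbox_pred = torch.randn(B, reg_channels, H, W)

# Heatmap: sigmoid, then clamp away exact 0/1 to keep the focal loss finite.
cls_score = cls_score.sigmoid().clamp(min=1e-4, max=1 - 1e-4)
# Dimension offsets: squash into (-0.5, 0.5) around the per-class priors.
bbox_pred[:, dim_channel, ...] = bbox_pred[:, dim_channel, ...].sigmoid() - 0.5
# Orientation: L2-normalise the 2-vector per location (F.normalize on dim 1).
bbox_pred[:, ori_channel, ...] = F.normalize(bbox_pred[:, ori_channel, ...])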
- """ - assert len(cls_scores) == len(bbox_preds) == 1 - cam2imgs = torch.stack([ - cls_scores[0].new_tensor(img_meta['cam2img']) - for img_meta in img_metas - ]) - trans_mats = torch.stack([ - cls_scores[0].new_tensor(img_meta['trans_mat']) - for img_meta in img_metas - ]) - batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap( - cls_scores[0], - bbox_preds[0], - img_metas, - cam2imgs=cam2imgs, - trans_mats=trans_mats, - topk=100, - kernel=3) - - result_list = [] - for img_id in range(len(img_metas)): - - bboxes = batch_bboxes[img_id] - scores = batch_scores[img_id] - labels = batch_topk_labels[img_id] - - keep_idx = scores > 0.25 - bboxes = bboxes[keep_idx] - scores = scores[keep_idx] - labels = labels[keep_idx] - - bboxes = img_metas[img_id]['box_type_3d']( - bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) - attrs = None - result_list.append((bboxes, scores, labels, attrs)) - - return result_list - - def decode_heatmap(self, - cls_score, - reg_pred, - img_metas, - cam2imgs, - trans_mats, - topk=100, - kernel=3): - """Transform outputs into detections raw bbox predictions. - - Args: - class_score (Tensor): Center predict heatmap, - shape (B, num_classes, H, W). - reg_pred (Tensor): Box regression map. - shape (B, channel, H , W). - img_metas (List[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - cam2imgs (Tensor): Camera intrinsic matrixs. - shape (B, 4, 4) - trans_mats (Tensor): Transformation matrix from original image - to feature map. - shape: (batch, 3, 3) - topk (int): Get top k center keypoints from heatmap. Default 100. - kernel (int): Max pooling kernel for extract local maximum pixels. - Default 3. - - Returns: - tuple[torch.Tensor]: Decoded output of SMOKEHead, containing - the following Tensors: - - batch_bboxes (Tensor): Coords of each 3D box. - shape (B, k, 7) - - batch_scores (Tensor): Scores of each 3D box. - shape (B, k) - - batch_topk_labels (Tensor): Categories of each 3D box. - shape (B, k) - """ - img_h, img_w = img_metas[0]['pad_shape'][:2] - bs, _, feat_h, feat_w = cls_score.shape - - center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel) - - *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap( - center_heatmap_pred, k=topk) - batch_scores, batch_index, batch_topk_labels = batch_dets - - regression = transpose_and_gather_feat(reg_pred, batch_index) - regression = regression.view(-1, 8) - - points = torch.cat([topk_xs.view(-1, 1), - topk_ys.view(-1, 1).float()], - dim=1) - locations, dimensions, orientations = self.bbox_coder.decode( - regression, points, batch_topk_labels, cam2imgs, trans_mats) - - batch_bboxes = torch.cat((locations, dimensions, orientations), dim=1) - batch_bboxes = batch_bboxes.view(bs, -1, self.bbox_code_size) - return batch_bboxes, batch_scores, batch_topk_labels - - def get_predictions(self, labels3d, centers2d, gt_locations, gt_dimensions, - gt_orientations, indices, img_metas, pred_reg): - """Prepare predictions for computing loss. - - Args: - labels3d (Tensor): Labels of each 3D box. - shape (B, max_objs, ) - centers2d (Tensor): Coords of each projected 3D box - center on image. shape (B * max_objs, 2) - gt_locations (Tensor): Coords of each 3D box's location. - shape (B * max_objs, 3) - gt_dimensions (Tensor): Dimensions of each 3D box. - shape (N, 3) - gt_orientations (Tensor): Orientation(yaw) of each 3D box. - shape (N, 1) - indices (Tensor): Indices of the existence of the 3D box. 
- shape (B * max_objs, ) - img_metas (list[dict]): Meta information of each image, - e.g., image size, scaling factor, etc. - pre_reg (Tensor): Box regression map. - shape (B, channel, H , W). - - Returns: - dict: the dict has components below: - - bbox3d_yaws (:obj:`CameraInstance3DBoxes`): - bbox calculated using pred orientations. - - bbox3d_dims (:obj:`CameraInstance3DBoxes`): - bbox calculated using pred dimensions. - - bbox3d_locs (:obj:`CameraInstance3DBoxes`): - bbox calculated using pred locations. - """ - batch, channel = pred_reg.shape[0], pred_reg.shape[1] - w = pred_reg.shape[3] - cam2imgs = torch.stack([ - gt_locations.new_tensor(img_meta['cam2img']) - for img_meta in img_metas - ]) - trans_mats = torch.stack([ - gt_locations.new_tensor(img_meta['trans_mat']) - for img_meta in img_metas - ]) - centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0] - centers2d_inds = centers2d_inds.view(batch, -1) - pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds) - pred_regression_pois = pred_regression.view(-1, channel) - locations, dimensions, orientations = self.bbox_coder.decode( - pred_regression_pois, centers2d, labels3d, cam2imgs, trans_mats, - gt_locations) - - locations, dimensions, orientations = locations[indices], dimensions[ - indices], orientations[indices] - - locations[:, 1] += dimensions[:, 1] / 2 - - gt_locations = gt_locations[indices] - - assert len(locations) == len(gt_locations) - assert len(dimensions) == len(gt_dimensions) - assert len(orientations) == len(gt_orientations) - bbox3d_yaws = self.bbox_coder.encode(gt_locations, gt_dimensions, - orientations, img_metas) - bbox3d_dims = self.bbox_coder.encode(gt_locations, dimensions, - gt_orientations, img_metas) - bbox3d_locs = self.bbox_coder.encode(locations, gt_dimensions, - gt_orientations, img_metas) - - pred_bboxes = dict(ori=bbox3d_yaws, dim=bbox3d_dims, loc=bbox3d_locs) - - return pred_bboxes - - def get_targets(self, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, - centers2d, feat_shape, img_shape, img_metas): - """Get training targets for batch images. - - Args: - gt_bboxes (list[Tensor]): Ground truth bboxes of each image, - shape (num_gt, 4). - gt_labels (list[Tensor]): Ground truth labels of each box, - shape (num_gt,). - gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D Ground - truth bboxes of each image, - shape (num_gt, bbox_code_size). - gt_labels_3d (list[Tensor]): 3D Ground truth labels of each - box, shape (num_gt,). - centers2d (list[Tensor]): Projected 3D centers onto 2D image, - shape (num_gt, 2). - feat_shape (tuple[int]): Feature map shape with value, - shape (B, _, H, W). - img_shape (tuple[int]): Image shape in [h, w] format. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Returns: - tuple[Tensor, dict]: The Tensor value is the targets of - center heatmap, the dict has components below: - - gt_centers2d (Tensor): Coords of each projected 3D box - center on image. shape (B * max_objs, 2) - - gt_labels3d (Tensor): Labels of each 3D box. - shape (B, max_objs, ) - - indices (Tensor): Indices of the existence of the 3D box. - shape (B * max_objs, ) - - affine_indices (Tensor): Indices of the affine of the 3D box. - shape (N, ) - - gt_locs (Tensor): Coords of each 3D box's location. - shape (N, 3) - - gt_dims (Tensor): Dimensions of each 3D box. - shape (N, 3) - - gt_yaws (Tensor): Orientation(yaw) of each 3D box. - shape (N, 1) - - gt_cors (Tensor): Coords of the corners of each 3D box. 
- shape (N, 8, 3) - """ - - reg_mask = torch.stack([ - gt_bboxes[0].new_tensor( - not img_meta['affine_aug'], dtype=torch.bool) - for img_meta in img_metas - ]) - - img_h, img_w = img_shape[:2] - bs, _, feat_h, feat_w = feat_shape - - width_ratio = float(feat_w / img_w) # 1/4 - height_ratio = float(feat_h / img_h) # 1/4 - - assert width_ratio == height_ratio - - center_heatmap_target = gt_bboxes[-1].new_zeros( - [bs, self.num_classes, feat_h, feat_w]) - - gt_centers2d = centers2d.copy() - - for batch_id in range(bs): - gt_bbox = gt_bboxes[batch_id] - gt_label = gt_labels[batch_id] - # project centers2d from input image to feat map - gt_center2d = gt_centers2d[batch_id] * width_ratio - - for j, center in enumerate(gt_center2d): - center_x_int, center_y_int = center.int() - scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio - scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio - radius = gaussian_radius([scale_box_h, scale_box_w], - min_overlap=0.7) - radius = max(0, int(radius)) - ind = gt_label[j] - gen_gaussian_target(center_heatmap_target[batch_id, ind], - [center_x_int, center_y_int], radius) - - avg_factor = max(1, center_heatmap_target.eq(1).sum()) - num_ctrs = [center2d.shape[0] for center2d in centers2d] - max_objs = max(num_ctrs) - - reg_inds = torch.cat( - [reg_mask[i].repeat(num_ctrs[i]) for i in range(bs)]) - - inds = torch.zeros((bs, max_objs), - dtype=torch.bool).to(centers2d[0].device) - - # put gt 3d bboxes to gpu - gt_bboxes_3d = [ - gt_bbox_3d.to(centers2d[0].device) for gt_bbox_3d in gt_bboxes_3d - ] - - batch_centers2d = centers2d[0].new_zeros((bs, max_objs, 2)) - batch_labels_3d = gt_labels_3d[0].new_zeros((bs, max_objs)) - batch_gt_locations = \ - gt_bboxes_3d[0].tensor.new_zeros((bs, max_objs, 3)) - for i in range(bs): - inds[i, :num_ctrs[i]] = 1 - batch_centers2d[i, :num_ctrs[i]] = centers2d[i] - batch_labels_3d[i, :num_ctrs[i]] = gt_labels_3d[i] - batch_gt_locations[i, :num_ctrs[i]] = \ - gt_bboxes_3d[i].tensor[:, :3] - - inds = inds.flatten() - batch_centers2d = batch_centers2d.view(-1, 2) * width_ratio - batch_gt_locations = batch_gt_locations.view(-1, 3) - - # filter the empty image, without gt_bboxes_3d - gt_bboxes_3d = [ - gt_bbox_3d for gt_bbox_3d in gt_bboxes_3d - if gt_bbox_3d.tensor.shape[0] > 0 - ] - - gt_dimensions = torch.cat( - [gt_bbox_3d.tensor[:, 3:6] for gt_bbox_3d in gt_bboxes_3d]) - gt_orientations = torch.cat([ - gt_bbox_3d.tensor[:, 6].unsqueeze(-1) - for gt_bbox_3d in gt_bboxes_3d - ]) - gt_corners = torch.cat( - [gt_bbox_3d.corners for gt_bbox_3d in gt_bboxes_3d]) - - target_labels = dict( - gt_centers2d=batch_centers2d.long(), - gt_labels3d=batch_labels_3d, - indices=inds, - reg_indices=reg_inds, - gt_locs=batch_gt_locations, - gt_dims=gt_dimensions, - gt_yaws=gt_orientations, - gt_cors=gt_corners) - - return center_heatmap_target, avg_factor, target_labels - - def loss(self, - cls_scores, - bbox_preds, - gt_bboxes, - gt_labels, - gt_bboxes_3d, - gt_labels_3d, - centers2d, - depths, - attr_labels, - img_metas, - gt_bboxes_ignore=None): - """Compute loss of the head. - - Args: - cls_scores (list[Tensor]): Box scores for each scale level. - shape (num_gt, 4). - bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel - number is bbox_code_size. - shape (B, 7, H, W). - gt_bboxes (list[Tensor]): Ground truth bboxes for each image. - shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): Class indices corresponding to each box. - shape (num_gts, ). 
- gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground - truth. it is the flipped gt_bboxes - gt_labels_3d (list[Tensor]): Same as gt_labels. - centers2d (list[Tensor]): 2D centers on the image. - shape (num_gts, 2). - depths (list[Tensor]): Depth ground truth. - shape (num_gts, ). - attr_labels (list[Tensor]): Attributes indices of each box. - In kitti it's None. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes_ignore (None | list[Tensor]): Specify which bounding - boxes can be ignored when computing the loss. - Default: None. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert len(cls_scores) == len(bbox_preds) == 1 - assert attr_labels is None - assert gt_bboxes_ignore is None - center2d_heatmap = cls_scores[0] - pred_reg = bbox_preds[0] - - center2d_heatmap_target, avg_factor, target_labels = \ - self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, - center2d_heatmap.shape, - img_metas[0]['pad_shape'], - img_metas) - - pred_bboxes = self.get_predictions( - labels3d=target_labels['gt_labels3d'], - centers2d=target_labels['gt_centers2d'], - gt_locations=target_labels['gt_locs'], - gt_dimensions=target_labels['gt_dims'], - gt_orientations=target_labels['gt_yaws'], - indices=target_labels['indices'], - img_metas=img_metas, - pred_reg=pred_reg) - - loss_cls = self.loss_cls( - center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor) - - reg_inds = target_labels['reg_indices'] - - loss_bbox_oris = self.loss_bbox( - pred_bboxes['ori'].corners[reg_inds, ...], - target_labels['gt_cors'][reg_inds, ...]) - - loss_bbox_dims = self.loss_bbox( - pred_bboxes['dim'].corners[reg_inds, ...], - target_labels['gt_cors'][reg_inds, ...]) - - loss_bbox_locs = self.loss_bbox( - pred_bboxes['loc'].corners[reg_inds, ...], - target_labels['gt_cors'][reg_inds, ...]) - - loss_bbox = loss_bbox_dims + loss_bbox_locs + loss_bbox_oris - - loss_dict = dict(loss_cls=loss_cls, loss_bbox=loss_bbox) - - return loss_dict +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn import functional as F + +from mmdet.core import multi_apply +from mmdet.core.bbox.builder import build_bbox_coder +from mmdet.models.utils import gaussian_radius, gen_gaussian_target +from mmdet.models.utils.gaussian_target import (get_local_maximum, + get_topk_from_heatmap, + transpose_and_gather_feat) +from ..builder import HEADS +from .anchor_free_mono3d_head import AnchorFreeMono3DHead + + +@HEADS.register_module() +class SMOKEMono3DHead(AnchorFreeMono3DHead): + r"""Anchor-free head used in `SMOKE `_ + + .. code-block:: none + + /-----> 3*3 conv -----> 1*1 conv -----> cls + feature + \-----> 3*3 conv -----> 1*1 conv -----> reg + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + dim_channel (list[int]): indices of dimension offset preds in + regression heatmap channels. + ori_channel (list[int]): indices of orientation offset pred in + regression heatmap channels. + bbox_coder (:obj:`CameraInstance3DBoxes`): Bbox coder + for encoding and decoding boxes. + loss_cls (dict, optional): Config of classification loss. + Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0). + loss_bbox (dict, optional): Config of localization loss. + Default: loss_bbox=dict(type='L1Loss', loss_weight=10.0). + loss_dir (dict, optional): Config of direction classification loss. 
+ In SMOKE, Default: None. + loss_attr (dict, optional): Config of attribute classification loss. + In SMOKE, Default: None. + loss_centerness (dict): Config of centerness loss. + norm_cfg (dict): Dictionary to construct and config norm layer. + Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). + init_cfg (dict): Initialization config dict. Default: None. + """ # noqa: E501 + + def __init__(self, + num_classes, + in_channels, + dim_channel, + ori_channel, + bbox_coder, + loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.1), + loss_dir=None, + loss_attr=None, + norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), + init_cfg=None, + **kwargs): + super().__init__( + num_classes, + in_channels, + loss_cls=loss_cls, + loss_bbox=loss_bbox, + loss_dir=loss_dir, + loss_attr=loss_attr, + norm_cfg=norm_cfg, + init_cfg=init_cfg, + **kwargs) + self.dim_channel = dim_channel + self.ori_channel = ori_channel + self.bbox_coder = build_bbox_coder(bbox_coder) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * bbox_code_size. + """ + return multi_apply(self.forward_single, feats) + + def forward_single(self, x): + """Forward features of a single scale level. + + Args: + x (Tensor): Input feature map. + + Returns: + tuple: Scores for each class, bbox of input feature maps. + """ + cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ + super().forward_single(x) + cls_score = cls_score.sigmoid() # turn to 0-1 + cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4) + # (N, C, H, W) + offset_dims = bbox_pred[:, self.dim_channel, ...] + bbox_pred[:, self.dim_channel, ...] = offset_dims.sigmoid() - 0.5 + # (N, C, H, W) + vector_ori = bbox_pred[:, self.ori_channel, ...] + bbox_pred[:, self.ori_channel, ...] = F.normalize(vector_ori) + return cls_score, bbox_pred + + def get_bboxes(self, cls_scores, bbox_preds, img_metas, rescale=None): + """Generate bboxes from bbox head predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level. + bbox_preds (list[Tensor]): Box regression for each scale. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + rescale (bool): If True, return boxes in original image space. + + Returns: + list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]: + Each item in result_list is 4-tuple. 
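# A minimal sketch of the channel-wise activations applied in forward_single above,
# assuming an 8-channel regression map; the shapes and the dim_channel / ori_channel
# index values here are illustrative, not taken from this patch.
import torch
from torch.nn import functional as F

cls_score = torch.rand(2, 3, 96, 320)          # raw heatmap logits (B, num_classes, H, W)
bbox_pred = torch.randn(2, 8, 96, 320)         # raw regression map (B, 8, H, W)
dim_channel = [3, 4, 5]                        # illustrative channel indices
ori_channel = [6, 7]

cls_score = cls_score.sigmoid().clamp(min=1e-4, max=1 - 1e-4)   # probabilities in (0, 1)
# dimension offsets are squashed to (-0.5, 0.5) before being decoded into box sizes
bbox_pred[:, dim_channel, ...] = bbox_pred[:, dim_channel, ...].sigmoid() - 0.5
# the 2-channel orientation vector is L2-normalized along the channel dimension
bbox_pred[:, ori_channel, ...] = F.normalize(bbox_pred[:, ori_channel, ...], dim=1)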
+ """ + assert len(cls_scores) == len(bbox_preds) == 1 + cam2imgs = torch.stack([ + cls_scores[0].new_tensor(img_meta['cam2img']) + for img_meta in img_metas + ]) + trans_mats = torch.stack([ + cls_scores[0].new_tensor(img_meta['trans_mat']) + for img_meta in img_metas + ]) + batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap( + cls_scores[0], + bbox_preds[0], + img_metas, + cam2imgs=cam2imgs, + trans_mats=trans_mats, + topk=100, + kernel=3) + + result_list = [] + for img_id in range(len(img_metas)): + + bboxes = batch_bboxes[img_id] + scores = batch_scores[img_id] + labels = batch_topk_labels[img_id] + + keep_idx = scores > 0.25 + bboxes = bboxes[keep_idx] + scores = scores[keep_idx] + labels = labels[keep_idx] + + bboxes = img_metas[img_id]['box_type_3d']( + bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) + attrs = None + result_list.append((bboxes, scores, labels, attrs)) + + return result_list + + def decode_heatmap(self, + cls_score, + reg_pred, + img_metas, + cam2imgs, + trans_mats, + topk=100, + kernel=3): + """Transform outputs into detections raw bbox predictions. + + Args: + class_score (Tensor): Center predict heatmap, + shape (B, num_classes, H, W). + reg_pred (Tensor): Box regression map. + shape (B, channel, H , W). + img_metas (List[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cam2imgs (Tensor): Camera intrinsic matrixs. + shape (B, 4, 4) + trans_mats (Tensor): Transformation matrix from original image + to feature map. + shape: (batch, 3, 3) + topk (int): Get top k center keypoints from heatmap. Default 100. + kernel (int): Max pooling kernel for extract local maximum pixels. + Default 3. + + Returns: + tuple[torch.Tensor]: Decoded output of SMOKEHead, containing + the following Tensors: + - batch_bboxes (Tensor): Coords of each 3D box. + shape (B, k, 7) + - batch_scores (Tensor): Scores of each 3D box. + shape (B, k) + - batch_topk_labels (Tensor): Categories of each 3D box. + shape (B, k) + """ + img_h, img_w = img_metas[0]['pad_shape'][:2] + bs, _, feat_h, feat_w = cls_score.shape + + center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel) + + *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap( + center_heatmap_pred, k=topk) + batch_scores, batch_index, batch_topk_labels = batch_dets + + regression = transpose_and_gather_feat(reg_pred, batch_index) + regression = regression.view(-1, 8) + + points = torch.cat([topk_xs.view(-1, 1), + topk_ys.view(-1, 1).float()], + dim=1) + locations, dimensions, orientations = self.bbox_coder.decode( + regression, points, batch_topk_labels, cam2imgs, trans_mats) + + batch_bboxes = torch.cat((locations, dimensions, orientations), dim=1) + batch_bboxes = batch_bboxes.view(bs, -1, self.bbox_code_size) + return batch_bboxes, batch_scores, batch_topk_labels + + def get_predictions(self, labels3d, centers2d, gt_locations, gt_dimensions, + gt_orientations, indices, img_metas, pred_reg): + """Prepare predictions for computing loss. + + Args: + labels3d (Tensor): Labels of each 3D box. + shape (B, max_objs, ) + centers2d (Tensor): Coords of each projected 3D box + center on image. shape (B * max_objs, 2) + gt_locations (Tensor): Coords of each 3D box's location. + shape (B * max_objs, 3) + gt_dimensions (Tensor): Dimensions of each 3D box. + shape (N, 3) + gt_orientations (Tensor): Orientation(yaw) of each 3D box. + shape (N, 1) + indices (Tensor): Indices of the existence of the 3D box. 
+ shape (B * max_objs, ) + img_metas (list[dict]): Meta information of each image, + e.g., image size, scaling factor, etc. + pre_reg (Tensor): Box regression map. + shape (B, channel, H , W). + + Returns: + dict: the dict has components below: + - bbox3d_yaws (:obj:`CameraInstance3DBoxes`): + bbox calculated using pred orientations. + - bbox3d_dims (:obj:`CameraInstance3DBoxes`): + bbox calculated using pred dimensions. + - bbox3d_locs (:obj:`CameraInstance3DBoxes`): + bbox calculated using pred locations. + """ + batch, channel = pred_reg.shape[0], pred_reg.shape[1] + w = pred_reg.shape[3] + cam2imgs = torch.stack([ + gt_locations.new_tensor(img_meta['cam2img']) + for img_meta in img_metas + ]) + trans_mats = torch.stack([ + gt_locations.new_tensor(img_meta['trans_mat']) + for img_meta in img_metas + ]) + centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0] + centers2d_inds = centers2d_inds.view(batch, -1) + pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds) + pred_regression_pois = pred_regression.view(-1, channel) + locations, dimensions, orientations = self.bbox_coder.decode( + pred_regression_pois, centers2d, labels3d, cam2imgs, trans_mats, + gt_locations) + + locations, dimensions, orientations = locations[indices], dimensions[ + indices], orientations[indices] + + locations[:, 1] += dimensions[:, 1] / 2 + + gt_locations = gt_locations[indices] + + assert len(locations) == len(gt_locations) + assert len(dimensions) == len(gt_dimensions) + assert len(orientations) == len(gt_orientations) + bbox3d_yaws = self.bbox_coder.encode(gt_locations, gt_dimensions, + orientations, img_metas) + bbox3d_dims = self.bbox_coder.encode(gt_locations, dimensions, + gt_orientations, img_metas) + bbox3d_locs = self.bbox_coder.encode(locations, gt_dimensions, + gt_orientations, img_metas) + + pred_bboxes = dict(ori=bbox3d_yaws, dim=bbox3d_dims, loc=bbox3d_locs) + + return pred_bboxes + + def get_targets(self, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, + centers2d, feat_shape, img_shape, img_metas): + """Get training targets for batch images. + + Args: + gt_bboxes (list[Tensor]): Ground truth bboxes of each image, + shape (num_gt, 4). + gt_labels (list[Tensor]): Ground truth labels of each box, + shape (num_gt,). + gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D Ground + truth bboxes of each image, + shape (num_gt, bbox_code_size). + gt_labels_3d (list[Tensor]): 3D Ground truth labels of each + box, shape (num_gt,). + centers2d (list[Tensor]): Projected 3D centers onto 2D image, + shape (num_gt, 2). + feat_shape (tuple[int]): Feature map shape with value, + shape (B, _, H, W). + img_shape (tuple[int]): Image shape in [h, w] format. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple[Tensor, dict]: The Tensor value is the targets of + center heatmap, the dict has components below: + - gt_centers2d (Tensor): Coords of each projected 3D box + center on image. shape (B * max_objs, 2) + - gt_labels3d (Tensor): Labels of each 3D box. + shape (B, max_objs, ) + - indices (Tensor): Indices of the existence of the 3D box. + shape (B * max_objs, ) + - affine_indices (Tensor): Indices of the affine of the 3D box. + shape (N, ) + - gt_locs (Tensor): Coords of each 3D box's location. + shape (N, 3) + - gt_dims (Tensor): Dimensions of each 3D box. + shape (N, 3) + - gt_yaws (Tensor): Orientation(yaw) of each 3D box. + shape (N, 1) + - gt_cors (Tensor): Coords of the corners of each 3D box. 
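# A short sketch of the feature gathering done in get_predictions above: integer (x, y)
# centers are turned into flat indices on the feature map and the regression channels at
# those positions are gathered, which is what transpose_and_gather_feat amounts to.
# All shapes below are illustrative.
import torch

B, C, H, W = 2, 8, 96, 320                      # illustrative feature-map shape
max_objs = 30
pred_reg = torch.randn(B, C, H, W)
xs = torch.randint(0, W, (B * max_objs, 1))
ys = torch.randint(0, H, (B * max_objs, 1))
centers2d = torch.cat([xs, ys], dim=1)          # integer (x, y) per padded object slot

flat_inds = (centers2d[:, 1] * W + centers2d[:, 0]).view(B, -1)           # (B, max_objs)
feat = pred_reg.permute(0, 2, 3, 1).reshape(B, H * W, C)                  # (B, H*W, C)
gathered = feat.gather(1, flat_inds.unsqueeze(-1).expand(-1, -1, C))      # (B, max_objs, C)
pred_regression_pois = gathered.view(-1, C)                               # (B*max_objs, C)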
+ shape (N, 8, 3) + """ + + reg_mask = torch.stack([ + gt_bboxes[0].new_tensor( + not img_meta['affine_aug'], dtype=torch.bool) + for img_meta in img_metas + ]) + + img_h, img_w = img_shape[:2] + bs, _, feat_h, feat_w = feat_shape + + width_ratio = float(feat_w / img_w) # 1/4 + height_ratio = float(feat_h / img_h) # 1/4 + + assert width_ratio == height_ratio + + center_heatmap_target = gt_bboxes[-1].new_zeros( + [bs, self.num_classes, feat_h, feat_w]) + + gt_centers2d = centers2d.copy() + + for batch_id in range(bs): + gt_bbox = gt_bboxes[batch_id] + gt_label = gt_labels[batch_id] + # project centers2d from input image to feat map + gt_center2d = gt_centers2d[batch_id] * width_ratio + + for j, center in enumerate(gt_center2d): + center_x_int, center_y_int = center.int() + scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio + scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio + radius = gaussian_radius([scale_box_h, scale_box_w], + min_overlap=0.7) + radius = max(0, int(radius)) + ind = gt_label[j] + gen_gaussian_target(center_heatmap_target[batch_id, ind], + [center_x_int, center_y_int], radius) + + avg_factor = max(1, center_heatmap_target.eq(1).sum()) + num_ctrs = [center2d.shape[0] for center2d in centers2d] + max_objs = max(num_ctrs) + + reg_inds = torch.cat( + [reg_mask[i].repeat(num_ctrs[i]) for i in range(bs)]) + + inds = torch.zeros((bs, max_objs), + dtype=torch.bool).to(centers2d[0].device) + + # put gt 3d bboxes to gpu + gt_bboxes_3d = [ + gt_bbox_3d.to(centers2d[0].device) for gt_bbox_3d in gt_bboxes_3d + ] + + batch_centers2d = centers2d[0].new_zeros((bs, max_objs, 2)) + batch_labels_3d = gt_labels_3d[0].new_zeros((bs, max_objs)) + batch_gt_locations = \ + gt_bboxes_3d[0].tensor.new_zeros((bs, max_objs, 3)) + for i in range(bs): + inds[i, :num_ctrs[i]] = 1 + batch_centers2d[i, :num_ctrs[i]] = centers2d[i] + batch_labels_3d[i, :num_ctrs[i]] = gt_labels_3d[i] + batch_gt_locations[i, :num_ctrs[i]] = \ + gt_bboxes_3d[i].tensor[:, :3] + + inds = inds.flatten() + batch_centers2d = batch_centers2d.view(-1, 2) * width_ratio + batch_gt_locations = batch_gt_locations.view(-1, 3) + + # filter the empty image, without gt_bboxes_3d + gt_bboxes_3d = [ + gt_bbox_3d for gt_bbox_3d in gt_bboxes_3d + if gt_bbox_3d.tensor.shape[0] > 0 + ] + + gt_dimensions = torch.cat( + [gt_bbox_3d.tensor[:, 3:6] for gt_bbox_3d in gt_bboxes_3d]) + gt_orientations = torch.cat([ + gt_bbox_3d.tensor[:, 6].unsqueeze(-1) + for gt_bbox_3d in gt_bboxes_3d + ]) + gt_corners = torch.cat( + [gt_bbox_3d.corners for gt_bbox_3d in gt_bboxes_3d]) + + target_labels = dict( + gt_centers2d=batch_centers2d.long(), + gt_labels3d=batch_labels_3d, + indices=inds, + reg_indices=reg_inds, + gt_locs=batch_gt_locations, + gt_dims=gt_dimensions, + gt_yaws=gt_orientations, + gt_cors=gt_corners) + + return center_heatmap_target, avg_factor, target_labels + + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level. + shape (num_gt, 4). + bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel + number is bbox_code_size. + shape (B, 7, H, W). + gt_bboxes (list[Tensor]): Ground truth bboxes for each image. + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): Class indices corresponding to each box. + shape (num_gts, ). 
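# A simplified sketch of the heatmap target generation in get_targets above: a 2D
# Gaussian is splatted onto the ground-truth class channel at each integer center, and
# the per-pixel maximum is kept when Gaussians overlap. In the patch the radius comes
# from gaussian_radius(..., min_overlap=0.7); here it is just passed in, and the sizes
# are illustrative.
import torch

def draw_gaussian(heatmap, center, radius):
    """Splat an isotropic Gaussian of the given radius at integer center (x, y)."""
    diameter = 2 * radius + 1
    sigma = diameter / 6
    ax = torch.arange(-radius, radius + 1, dtype=torch.float32)
    gaussian = torch.exp(-(ax[None, :] ** 2 + ax[:, None] ** 2) / (2 * sigma * sigma))

    cx, cy = center
    h, w = heatmap.shape
    left, right = min(cx, radius), min(w - cx, radius + 1)
    top, bottom = min(cy, radius), min(h - cy, radius + 1)
    patch = heatmap[cy - top:cy + bottom, cx - left:cx + right]
    masked = gaussian[radius - top:radius + bottom, radius - left:radius + right]
    patch.copy_(torch.maximum(patch, masked))   # keep the per-pixel maximum
    return heatmap

heatmap = torch.zeros(96, 320)                  # one class channel of the target heatmap
draw_gaussian(heatmap, center=(160, 48), radius=4)
# pixels equal to 1 are the positive centers counted by avg_factor in the code above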
+ gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground + truth. it is the flipped gt_bboxes + gt_labels_3d (list[Tensor]): Same as gt_labels. + centers2d (list[Tensor]): 2D centers on the image. + shape (num_gts, 2). + depths (list[Tensor]): Depth ground truth. + shape (num_gts, ). + attr_labels (list[Tensor]): Attributes indices of each box. + In kitti it's None. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): Specify which bounding + boxes can be ignored when computing the loss. + Default: None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert len(cls_scores) == len(bbox_preds) == 1 + assert attr_labels is None + assert gt_bboxes_ignore is None + center2d_heatmap = cls_scores[0] + pred_reg = bbox_preds[0] + + center2d_heatmap_target, avg_factor, target_labels = \ + self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, + center2d_heatmap.shape, + img_metas[0]['pad_shape'], + img_metas) + + pred_bboxes = self.get_predictions( + labels3d=target_labels['gt_labels3d'], + centers2d=target_labels['gt_centers2d'], + gt_locations=target_labels['gt_locs'], + gt_dimensions=target_labels['gt_dims'], + gt_orientations=target_labels['gt_yaws'], + indices=target_labels['indices'], + img_metas=img_metas, + pred_reg=pred_reg) + + loss_cls = self.loss_cls( + center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor) + + reg_inds = target_labels['reg_indices'] + + loss_bbox_oris = self.loss_bbox( + pred_bboxes['ori'].corners[reg_inds, ...], + target_labels['gt_cors'][reg_inds, ...]) + + loss_bbox_dims = self.loss_bbox( + pred_bboxes['dim'].corners[reg_inds, ...], + target_labels['gt_cors'][reg_inds, ...]) + + loss_bbox_locs = self.loss_bbox( + pred_bboxes['loc'].corners[reg_inds, ...], + target_labels['gt_cors'][reg_inds, ...]) + + loss_bbox = loss_bbox_dims + loss_bbox_locs + loss_bbox_oris + + loss_dict = dict(loss_cls=loss_cls, loss_bbox=loss_bbox) + + return loss_dict diff --git a/mmdet3d/models/dense_heads/ssd_3d_head.py b/mmdet3d/models/dense_heads/ssd_3d_head.py index c20c4b1..5a6df11 100644 --- a/mmdet3d/models/dense_heads/ssd_3d_head.py +++ b/mmdet3d/models/dense_heads/ssd_3d_head.py @@ -1,557 +1,557 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.ops.nms import batched_nms -from mmcv.runner import force_fp32 -from torch.nn import functional as F - -from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes, - LiDARInstance3DBoxes, - rotation_3d_in_axis) -from mmdet.core import multi_apply -from ..builder import HEADS, build_loss -from .vote_head import VoteHead - - -@HEADS.register_module() -class SSD3DHead(VoteHead): - r"""Bbox head of `3DSSD `_. - - Args: - num_classes (int): The number of class. - bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and - decoding boxes. - in_channels (int): The number of input feature channel. - train_cfg (dict): Config for training. - test_cfg (dict): Config for testing. - vote_module_cfg (dict): Config of VoteModule for point-wise votes. - vote_aggregation_cfg (dict): Config of vote aggregation layer. - pred_layer_cfg (dict): Config of classfication and regression - prediction layers. - conv_cfg (dict): Config of convolution in prediction layer. - norm_cfg (dict): Config of BN in prediction layer. - act_cfg (dict): Config of activation in prediction layer. - objectness_loss (dict): Config of objectness loss. 
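# A tiny sketch of the disentangled corner loss assembled in the SMOKE loss above: three
# hypothetical boxes are built, each replacing exactly one ground-truth component
# (orientation, dimensions or location) with its prediction, and every variant is
# penalized against the same ground-truth corners. The corner tensors are illustrative
# stand-ins for the CameraInstance3DBoxes corners used in the patch.
import torch
from torch.nn import functional as F

gt_corners = torch.randn(4, 8, 3)
corners_pred_ori = gt_corners + 0.1 * torch.randn(4, 8, 3)   # GT dims/loc, predicted yaw
corners_pred_dim = gt_corners + 0.1 * torch.randn(4, 8, 3)   # GT yaw/loc, predicted dims
corners_pred_loc = gt_corners + 0.1 * torch.randn(4, 8, 3)   # GT yaw/dims, predicted loc

loss_bbox = (F.l1_loss(corners_pred_ori, gt_corners) +
             F.l1_loss(corners_pred_dim, gt_corners) +
             F.l1_loss(corners_pred_loc, gt_corners))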
- center_loss (dict): Config of center loss. - dir_class_loss (dict): Config of direction classification loss. - dir_res_loss (dict): Config of direction residual regression loss. - size_res_loss (dict): Config of size residual regression loss. - corner_loss (dict): Config of bbox corners regression loss. - vote_loss (dict): Config of candidate points regression loss. - """ - - def __init__(self, - num_classes, - bbox_coder, - in_channels=256, - train_cfg=None, - test_cfg=None, - vote_module_cfg=None, - vote_aggregation_cfg=None, - pred_layer_cfg=None, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - objectness_loss=None, - center_loss=None, - dir_class_loss=None, - dir_res_loss=None, - size_res_loss=None, - corner_loss=None, - vote_loss=None, - init_cfg=None): - super(SSD3DHead, self).__init__( - num_classes, - bbox_coder, - train_cfg=train_cfg, - test_cfg=test_cfg, - vote_module_cfg=vote_module_cfg, - vote_aggregation_cfg=vote_aggregation_cfg, - pred_layer_cfg=pred_layer_cfg, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - objectness_loss=objectness_loss, - center_loss=center_loss, - dir_class_loss=dir_class_loss, - dir_res_loss=dir_res_loss, - size_class_loss=None, - size_res_loss=size_res_loss, - semantic_loss=None, - init_cfg=init_cfg) - - self.corner_loss = build_loss(corner_loss) - self.vote_loss = build_loss(vote_loss) - self.num_candidates = vote_module_cfg['num_points'] - - def _get_cls_out_channels(self): - """Return the channel number of classification outputs.""" - # Class numbers (k) + objectness (1) - return self.num_classes - - def _get_reg_out_channels(self): - """Return the channel number of regression outputs.""" - # Bbox classification and regression - # (center residual (3), size regression (3) - # heading class+residual (num_dir_bins*2)), - return 3 + 3 + self.num_dir_bins * 2 - - def _extract_input(self, feat_dict): - """Extract inputs from features dictionary. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - torch.Tensor: Coordinates of input points. - torch.Tensor: Features of input points. - torch.Tensor: Indices of input points. - """ - seed_points = feat_dict['sa_xyz'][-1] - seed_features = feat_dict['sa_features'][-1] - seed_indices = feat_dict['sa_indices'][-1] - - return seed_points, seed_features, seed_indices - - @force_fp32(apply_to=('bbox_preds', )) - def loss(self, - bbox_preds, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - img_metas=None, - gt_bboxes_ignore=None): - """Compute loss. - - Args: - bbox_preds (dict): Predictions from forward of SSD3DHead. - points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each sample. - gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (list[torch.Tensor]): Point-wise - semantic mask. - pts_instance_mask (list[torch.Tensor]): Point-wise - instance mask. - img_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - - Returns: - dict: Losses of 3DSSD. 
- """ - targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, - bbox_preds) - (vote_targets, center_targets, size_res_targets, dir_class_targets, - dir_res_targets, mask_targets, centerness_targets, corner3d_targets, - vote_mask, positive_mask, negative_mask, centerness_weights, - box_loss_weights, heading_res_loss_weight) = targets - - # calculate centerness loss - centerness_loss = self.objectness_loss( - bbox_preds['obj_scores'].transpose(2, 1), - centerness_targets, - weight=centerness_weights) - - # calculate center loss - center_loss = self.center_loss( - bbox_preds['center_offset'], - center_targets, - weight=box_loss_weights.unsqueeze(-1)) - - # calculate direction class loss - dir_class_loss = self.dir_class_loss( - bbox_preds['dir_class'].transpose(1, 2), - dir_class_targets, - weight=box_loss_weights) - - # calculate direction residual loss - dir_res_loss = self.dir_res_loss( - bbox_preds['dir_res_norm'], - dir_res_targets.unsqueeze(-1).repeat(1, 1, self.num_dir_bins), - weight=heading_res_loss_weight) - - # calculate size residual loss - size_loss = self.size_res_loss( - bbox_preds['size'], - size_res_targets, - weight=box_loss_weights.unsqueeze(-1)) - - # calculate corner loss - one_hot_dir_class_targets = dir_class_targets.new_zeros( - bbox_preds['dir_class'].shape) - one_hot_dir_class_targets.scatter_(2, dir_class_targets.unsqueeze(-1), - 1) - pred_bbox3d = self.bbox_coder.decode( - dict( - center=bbox_preds['center'], - dir_res=bbox_preds['dir_res'], - dir_class=one_hot_dir_class_targets, - size=bbox_preds['size'])) - pred_bbox3d = pred_bbox3d.reshape(-1, pred_bbox3d.shape[-1]) - pred_bbox3d = img_metas[0]['box_type_3d']( - pred_bbox3d.clone(), - box_dim=pred_bbox3d.shape[-1], - with_yaw=self.bbox_coder.with_rot, - origin=(0.5, 0.5, 0.5)) - pred_corners3d = pred_bbox3d.corners.reshape(-1, 8, 3) - corner_loss = self.corner_loss( - pred_corners3d, - corner3d_targets.reshape(-1, 8, 3), - weight=box_loss_weights.view(-1, 1, 1)) - - # calculate vote loss - vote_loss = self.vote_loss( - bbox_preds['vote_offset'].transpose(1, 2), - vote_targets, - weight=vote_mask.unsqueeze(-1)) - - losses = dict( - centerness_loss=centerness_loss, - center_loss=center_loss, - dir_class_loss=dir_class_loss, - dir_res_loss=dir_res_loss, - size_res_loss=size_loss, - corner_loss=corner_loss, - vote_loss=vote_loss) - - return losses - - def get_targets(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - bbox_preds=None): - """Generate targets of ssd3d head. - - Args: - points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (list[torch.Tensor]): Point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): Point-wise instance - label of each batch. - bbox_preds (torch.Tensor): Bounding box predictions of ssd3d head. - - Returns: - tuple[torch.Tensor]: Targets of ssd3d head. 
- """ - # find empty example - for index in range(len(gt_labels_3d)): - if len(gt_labels_3d[index]) == 0: - fake_box = gt_bboxes_3d[index].tensor.new_zeros( - 1, gt_bboxes_3d[index].tensor.shape[-1]) - gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) - gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) - - if pts_semantic_mask is None: - pts_semantic_mask = [None for i in range(len(gt_labels_3d))] - pts_instance_mask = [None for i in range(len(gt_labels_3d))] - - aggregated_points = [ - bbox_preds['aggregated_points'][i] - for i in range(len(gt_labels_3d)) - ] - - seed_points = [ - bbox_preds['seed_points'][i, :self.num_candidates].detach() - for i in range(len(gt_labels_3d)) - ] - - (vote_targets, center_targets, size_res_targets, dir_class_targets, - dir_res_targets, mask_targets, centerness_targets, corner3d_targets, - vote_mask, positive_mask, negative_mask) = multi_apply( - self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, aggregated_points, - seed_points) - - center_targets = torch.stack(center_targets) - positive_mask = torch.stack(positive_mask) - negative_mask = torch.stack(negative_mask) - dir_class_targets = torch.stack(dir_class_targets) - dir_res_targets = torch.stack(dir_res_targets) - size_res_targets = torch.stack(size_res_targets) - mask_targets = torch.stack(mask_targets) - centerness_targets = torch.stack(centerness_targets).detach() - corner3d_targets = torch.stack(corner3d_targets) - vote_targets = torch.stack(vote_targets) - vote_mask = torch.stack(vote_mask) - - center_targets -= bbox_preds['aggregated_points'] - - centerness_weights = (positive_mask + - negative_mask).unsqueeze(-1).repeat( - 1, 1, self.num_classes).float() - centerness_weights = centerness_weights / \ - (centerness_weights.sum() + 1e-6) - vote_mask = vote_mask / (vote_mask.sum() + 1e-6) - - box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6) - - batch_size, proposal_num = dir_class_targets.shape[:2] - heading_label_one_hot = dir_class_targets.new_zeros( - (batch_size, proposal_num, self.num_dir_bins)) - heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) - heading_res_loss_weight = heading_label_one_hot * \ - box_loss_weights.unsqueeze(-1) - - return (vote_targets, center_targets, size_res_targets, - dir_class_targets, dir_res_targets, mask_targets, - centerness_targets, corner3d_targets, vote_mask, positive_mask, - negative_mask, centerness_weights, box_loss_weights, - heading_res_loss_weight) - - def get_targets_single(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - aggregated_points=None, - seed_points=None): - """Generate targets of ssd3d head for single batch. - - Args: - points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth - boxes of each batch. - gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (torch.Tensor): Point-wise semantic - label of each batch. - pts_instance_mask (torch.Tensor): Point-wise instance - label of each batch. - aggregated_points (torch.Tensor): Aggregated points from - candidate points layer. - seed_points (torch.Tensor): Seed points of candidate points. - - Returns: - tuple[torch.Tensor]: Targets of ssd3d head. 
- """ - assert self.bbox_coder.with_rot or pts_semantic_mask is not None - gt_bboxes_3d = gt_bboxes_3d.to(points.device) - valid_gt = gt_labels_3d != -1 - gt_bboxes_3d = gt_bboxes_3d[valid_gt] - gt_labels_3d = gt_labels_3d[valid_gt] - - # Generate fake GT for empty scene - if valid_gt.sum() == 0: - vote_targets = points.new_zeros(self.num_candidates, 3) - center_targets = points.new_zeros(self.num_candidates, 3) - size_res_targets = points.new_zeros(self.num_candidates, 3) - dir_class_targets = points.new_zeros( - self.num_candidates, dtype=torch.int64) - dir_res_targets = points.new_zeros(self.num_candidates) - mask_targets = points.new_zeros( - self.num_candidates, dtype=torch.int64) - centerness_targets = points.new_zeros(self.num_candidates, - self.num_classes) - corner3d_targets = points.new_zeros(self.num_candidates, 8, 3) - vote_mask = points.new_zeros(self.num_candidates, dtype=torch.bool) - positive_mask = points.new_zeros( - self.num_candidates, dtype=torch.bool) - negative_mask = points.new_ones( - self.num_candidates, dtype=torch.bool) - return (vote_targets, center_targets, size_res_targets, - dir_class_targets, dir_res_targets, mask_targets, - centerness_targets, corner3d_targets, vote_mask, - positive_mask, negative_mask) - - gt_corner3d = gt_bboxes_3d.corners - - (center_targets, size_targets, dir_class_targets, - dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) - - points_mask, assignment = self._assign_targets_by_points_inside( - gt_bboxes_3d, aggregated_points) - - center_targets = center_targets[assignment] - size_res_targets = size_targets[assignment] - mask_targets = gt_labels_3d[assignment] - dir_class_targets = dir_class_targets[assignment] - dir_res_targets = dir_res_targets[assignment] - corner3d_targets = gt_corner3d[assignment] - - top_center_targets = center_targets.clone() - top_center_targets[:, 2] += size_res_targets[:, 2] - dist = torch.norm(aggregated_points - top_center_targets, dim=1) - dist_mask = dist < self.train_cfg.pos_distance_thr - positive_mask = (points_mask.max(1)[0] > 0) * dist_mask - negative_mask = (points_mask.max(1)[0] == 0) - - # Centerness loss targets - canonical_xyz = aggregated_points - center_targets - if self.bbox_coder.with_rot: - # TODO: Align points rotation implementation of - # LiDARInstance3DBoxes and DepthInstance3DBoxes - canonical_xyz = rotation_3d_in_axis( - canonical_xyz.unsqueeze(0).transpose(0, 1), - -gt_bboxes_3d.yaw[assignment], - axis=2).squeeze(1) - distance_front = torch.clamp( - size_res_targets[:, 0] - canonical_xyz[:, 0], min=0) - distance_back = torch.clamp( - size_res_targets[:, 0] + canonical_xyz[:, 0], min=0) - distance_left = torch.clamp( - size_res_targets[:, 1] - canonical_xyz[:, 1], min=0) - distance_right = torch.clamp( - size_res_targets[:, 1] + canonical_xyz[:, 1], min=0) - distance_top = torch.clamp( - size_res_targets[:, 2] - canonical_xyz[:, 2], min=0) - distance_bottom = torch.clamp( - size_res_targets[:, 2] + canonical_xyz[:, 2], min=0) - - centerness_l = torch.min(distance_front, distance_back) / torch.max( - distance_front, distance_back) - centerness_w = torch.min(distance_left, distance_right) / torch.max( - distance_left, distance_right) - centerness_h = torch.min(distance_bottom, distance_top) / torch.max( - distance_bottom, distance_top) - centerness_targets = torch.clamp( - centerness_l * centerness_w * centerness_h, min=0) - centerness_targets = centerness_targets.pow(1 / 3.0) - centerness_targets = torch.clamp(centerness_targets, min=0, max=1) - - proposal_num = 
centerness_targets.shape[0] - one_hot_centerness_targets = centerness_targets.new_zeros( - (proposal_num, self.num_classes)) - one_hot_centerness_targets.scatter_(1, mask_targets.unsqueeze(-1), 1) - centerness_targets = centerness_targets.unsqueeze( - 1) * one_hot_centerness_targets - - # Vote loss targets - enlarged_gt_bboxes_3d = gt_bboxes_3d.enlarged_box( - self.train_cfg.expand_dims_length) - enlarged_gt_bboxes_3d.tensor[:, 2] -= self.train_cfg.expand_dims_length - vote_mask, vote_assignment = self._assign_targets_by_points_inside( - enlarged_gt_bboxes_3d, seed_points) - - vote_targets = gt_bboxes_3d.gravity_center - vote_targets = vote_targets[vote_assignment] - seed_points - vote_mask = vote_mask.max(1)[0] > 0 - - return (vote_targets, center_targets, size_res_targets, - dir_class_targets, dir_res_targets, mask_targets, - centerness_targets, corner3d_targets, vote_mask, positive_mask, - negative_mask) - - def get_bboxes(self, points, bbox_preds, input_metas, rescale=False): - """Generate bboxes from 3DSSD head predictions. - - Args: - points (torch.Tensor): Input points. - bbox_preds (dict): Predictions from sdd3d head. - input_metas (list[dict]): Point cloud and image's meta info. - rescale (bool): Whether to rescale bboxes. - - Returns: - list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. - """ - # decode boxes - sem_scores = F.sigmoid(bbox_preds['obj_scores']).transpose(1, 2) - obj_scores = sem_scores.max(-1)[0] - bbox3d = self.bbox_coder.decode(bbox_preds) - - batch_size = bbox3d.shape[0] - results = list() - - for b in range(batch_size): - bbox_selected, score_selected, labels = self.multiclass_nms_single( - obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], - input_metas[b]) - - bbox = input_metas[b]['box_type_3d']( - bbox_selected.clone(), - box_dim=bbox_selected.shape[-1], - with_yaw=self.bbox_coder.with_rot) - results.append((bbox, score_selected, labels)) - - return results - - def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, - input_meta): - """Multi-class nms in single batch. - - Args: - obj_scores (torch.Tensor): Objectness score of bounding boxes. - sem_scores (torch.Tensor): Semantic class score of bounding boxes. - bbox (torch.Tensor): Predicted bounding boxes. - points (torch.Tensor): Input points. - input_meta (dict): Point cloud and image's meta info. - - Returns: - tuple[torch.Tensor]: Bounding boxes, scores and labels. 
- """ - bbox = input_meta['box_type_3d']( - bbox.clone(), - box_dim=bbox.shape[-1], - with_yaw=self.bbox_coder.with_rot, - origin=(0.5, 0.5, 0.5)) - - if isinstance(bbox, (LiDARInstance3DBoxes, DepthInstance3DBoxes)): - box_indices = bbox.points_in_boxes_all(points) - nonempty_box_mask = box_indices.T.sum(1) >= 0 - else: - raise NotImplementedError('Unsupported bbox type!') - - corner3d = bbox.corners - minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) - minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] - minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] - - bbox_classes = torch.argmax(sem_scores, -1) - nms_keep = batched_nms( - minmax_box3d[nonempty_box_mask][:, [0, 1, 3, 4]], - obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], - self.test_cfg.nms_cfg)[1] - - if nms_keep.shape[0] > self.test_cfg.max_output_num: - nms_keep = nms_keep[:self.test_cfg.max_output_num] - - # filter empty boxes and boxes with low score - scores_mask = (obj_scores >= self.test_cfg.score_thr) - nonempty_box_inds = torch.nonzero( - nonempty_box_mask, as_tuple=False).flatten() - nonempty_mask = torch.zeros_like(bbox_classes).scatter( - 0, nonempty_box_inds[nms_keep], 1) - selected = (nonempty_mask.bool() & scores_mask.bool()) - - if self.test_cfg.per_class_proposal: - bbox_selected, score_selected, labels = [], [], [] - for k in range(sem_scores.shape[-1]): - bbox_selected.append(bbox[selected].tensor) - score_selected.append(obj_scores[selected]) - labels.append( - torch.zeros_like(bbox_classes[selected]).fill_(k)) - bbox_selected = torch.cat(bbox_selected, 0) - score_selected = torch.cat(score_selected, 0) - labels = torch.cat(labels, 0) - else: - bbox_selected = bbox[selected].tensor - score_selected = obj_scores[selected] - labels = bbox_classes[selected] - - return bbox_selected, score_selected, labels - - def _assign_targets_by_points_inside(self, bboxes_3d, points): - """Compute assignment by checking whether point is inside bbox. - - Args: - bboxes_3d (BaseInstance3DBoxes): Instance of bounding boxes. - points (torch.Tensor): Points of a batch. - - Returns: - tuple[torch.Tensor]: Flags indicating whether each point is - inside bbox and the index of box where each point are in. - """ - if isinstance(bboxes_3d, (LiDARInstance3DBoxes, DepthInstance3DBoxes)): - points_mask = bboxes_3d.points_in_boxes_all(points) - assignment = points_mask.argmax(dim=-1) - else: - raise NotImplementedError('Unsupported bbox type!') - - return points_mask, assignment +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.ops.nms import batched_nms +from mmcv.runner import force_fp32 +from torch.nn import functional as F + +from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes, + LiDARInstance3DBoxes, + rotation_3d_in_axis) +from mmdet.core import multi_apply +from ..builder import HEADS, build_loss +from .vote_head import VoteHead + + +@HEADS.register_module() +class SSD3DHead(VoteHead): + r"""Bbox head of `3DSSD `_. + + Args: + num_classes (int): The number of class. + bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and + decoding boxes. + in_channels (int): The number of input feature channel. + train_cfg (dict): Config for training. + test_cfg (dict): Config for testing. + vote_module_cfg (dict): Config of VoteModule for point-wise votes. + vote_aggregation_cfg (dict): Config of vote aggregation layer. + pred_layer_cfg (dict): Config of classfication and regression + prediction layers. + conv_cfg (dict): Config of convolution in prediction layer. 
+ norm_cfg (dict): Config of BN in prediction layer. + act_cfg (dict): Config of activation in prediction layer. + objectness_loss (dict): Config of objectness loss. + center_loss (dict): Config of center loss. + dir_class_loss (dict): Config of direction classification loss. + dir_res_loss (dict): Config of direction residual regression loss. + size_res_loss (dict): Config of size residual regression loss. + corner_loss (dict): Config of bbox corners regression loss. + vote_loss (dict): Config of candidate points regression loss. + """ + + def __init__(self, + num_classes, + bbox_coder, + in_channels=256, + train_cfg=None, + test_cfg=None, + vote_module_cfg=None, + vote_aggregation_cfg=None, + pred_layer_cfg=None, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + objectness_loss=None, + center_loss=None, + dir_class_loss=None, + dir_res_loss=None, + size_res_loss=None, + corner_loss=None, + vote_loss=None, + init_cfg=None): + super(SSD3DHead, self).__init__( + num_classes, + bbox_coder, + train_cfg=train_cfg, + test_cfg=test_cfg, + vote_module_cfg=vote_module_cfg, + vote_aggregation_cfg=vote_aggregation_cfg, + pred_layer_cfg=pred_layer_cfg, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + objectness_loss=objectness_loss, + center_loss=center_loss, + dir_class_loss=dir_class_loss, + dir_res_loss=dir_res_loss, + size_class_loss=None, + size_res_loss=size_res_loss, + semantic_loss=None, + init_cfg=init_cfg) + + self.corner_loss = build_loss(corner_loss) + self.vote_loss = build_loss(vote_loss) + self.num_candidates = vote_module_cfg['num_points'] + + def _get_cls_out_channels(self): + """Return the channel number of classification outputs.""" + # Class numbers (k) + objectness (1) + return self.num_classes + + def _get_reg_out_channels(self): + """Return the channel number of regression outputs.""" + # Bbox classification and regression + # (center residual (3), size regression (3) + # heading class+residual (num_dir_bins*2)), + return 3 + 3 + self.num_dir_bins * 2 + + def _extract_input(self, feat_dict): + """Extract inputs from features dictionary. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: Coordinates of input points. + torch.Tensor: Features of input points. + torch.Tensor: Indices of input points. + """ + seed_points = feat_dict['sa_xyz'][-1] + seed_features = feat_dict['sa_features'][-1] + seed_indices = feat_dict['sa_indices'][-1] + + return seed_points, seed_features, seed_indices + + @force_fp32(apply_to=('bbox_preds', )) + def loss(self, + bbox_preds, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + img_metas=None, + gt_bboxes_ignore=None): + """Compute loss. + + Args: + bbox_preds (dict): Predictions from forward of SSD3DHead. + points (list[torch.Tensor]): Input points. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + pts_semantic_mask (list[torch.Tensor]): Point-wise + semantic mask. + pts_instance_mask (list[torch.Tensor]): Point-wise + instance mask. + img_metas (list[dict]): Contain pcd and img's meta info. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + + Returns: + dict: Losses of 3DSSD. 
+ """ + targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, + bbox_preds) + (vote_targets, center_targets, size_res_targets, dir_class_targets, + dir_res_targets, mask_targets, centerness_targets, corner3d_targets, + vote_mask, positive_mask, negative_mask, centerness_weights, + box_loss_weights, heading_res_loss_weight) = targets + + # calculate centerness loss + centerness_loss = self.objectness_loss( + bbox_preds['obj_scores'].transpose(2, 1), + centerness_targets, + weight=centerness_weights) + + # calculate center loss + center_loss = self.center_loss( + bbox_preds['center_offset'], + center_targets, + weight=box_loss_weights.unsqueeze(-1)) + + # calculate direction class loss + dir_class_loss = self.dir_class_loss( + bbox_preds['dir_class'].transpose(1, 2), + dir_class_targets, + weight=box_loss_weights) + + # calculate direction residual loss + dir_res_loss = self.dir_res_loss( + bbox_preds['dir_res_norm'], + dir_res_targets.unsqueeze(-1).repeat(1, 1, self.num_dir_bins), + weight=heading_res_loss_weight) + + # calculate size residual loss + size_loss = self.size_res_loss( + bbox_preds['size'], + size_res_targets, + weight=box_loss_weights.unsqueeze(-1)) + + # calculate corner loss + one_hot_dir_class_targets = dir_class_targets.new_zeros( + bbox_preds['dir_class'].shape) + one_hot_dir_class_targets.scatter_(2, dir_class_targets.unsqueeze(-1), + 1) + pred_bbox3d = self.bbox_coder.decode( + dict( + center=bbox_preds['center'], + dir_res=bbox_preds['dir_res'], + dir_class=one_hot_dir_class_targets, + size=bbox_preds['size'])) + pred_bbox3d = pred_bbox3d.reshape(-1, pred_bbox3d.shape[-1]) + pred_bbox3d = img_metas[0]['box_type_3d']( + pred_bbox3d.clone(), + box_dim=pred_bbox3d.shape[-1], + with_yaw=self.bbox_coder.with_rot, + origin=(0.5, 0.5, 0.5)) + pred_corners3d = pred_bbox3d.corners.reshape(-1, 8, 3) + corner_loss = self.corner_loss( + pred_corners3d, + corner3d_targets.reshape(-1, 8, 3), + weight=box_loss_weights.view(-1, 1, 1)) + + # calculate vote loss + vote_loss = self.vote_loss( + bbox_preds['vote_offset'].transpose(1, 2), + vote_targets, + weight=vote_mask.unsqueeze(-1)) + + losses = dict( + centerness_loss=centerness_loss, + center_loss=center_loss, + dir_class_loss=dir_class_loss, + dir_res_loss=dir_res_loss, + size_res_loss=size_loss, + corner_loss=corner_loss, + vote_loss=vote_loss) + + return losses + + def get_targets(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + bbox_preds=None): + """Generate targets of ssd3d head. + + Args: + points (list[torch.Tensor]): Points of each batch. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): Labels of each batch. + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): Point-wise instance + label of each batch. + bbox_preds (torch.Tensor): Bounding box predictions of ssd3d head. + + Returns: + tuple[torch.Tensor]: Targets of ssd3d head. 
+ """ + # find empty example + for index in range(len(gt_labels_3d)): + if len(gt_labels_3d[index]) == 0: + fake_box = gt_bboxes_3d[index].tensor.new_zeros( + 1, gt_bboxes_3d[index].tensor.shape[-1]) + gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) + gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) + + if pts_semantic_mask is None: + pts_semantic_mask = [None for i in range(len(gt_labels_3d))] + pts_instance_mask = [None for i in range(len(gt_labels_3d))] + + aggregated_points = [ + bbox_preds['aggregated_points'][i] + for i in range(len(gt_labels_3d)) + ] + + seed_points = [ + bbox_preds['seed_points'][i, :self.num_candidates].detach() + for i in range(len(gt_labels_3d)) + ] + + (vote_targets, center_targets, size_res_targets, dir_class_targets, + dir_res_targets, mask_targets, centerness_targets, corner3d_targets, + vote_mask, positive_mask, negative_mask) = multi_apply( + self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, aggregated_points, + seed_points) + + center_targets = torch.stack(center_targets) + positive_mask = torch.stack(positive_mask) + negative_mask = torch.stack(negative_mask) + dir_class_targets = torch.stack(dir_class_targets) + dir_res_targets = torch.stack(dir_res_targets) + size_res_targets = torch.stack(size_res_targets) + mask_targets = torch.stack(mask_targets) + centerness_targets = torch.stack(centerness_targets).detach() + corner3d_targets = torch.stack(corner3d_targets) + vote_targets = torch.stack(vote_targets) + vote_mask = torch.stack(vote_mask) + + center_targets -= bbox_preds['aggregated_points'] + + centerness_weights = (positive_mask + + negative_mask).unsqueeze(-1).repeat( + 1, 1, self.num_classes).float() + centerness_weights = centerness_weights / \ + (centerness_weights.sum() + 1e-6) + vote_mask = vote_mask / (vote_mask.sum() + 1e-6) + + box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6) + + batch_size, proposal_num = dir_class_targets.shape[:2] + heading_label_one_hot = dir_class_targets.new_zeros( + (batch_size, proposal_num, self.num_dir_bins)) + heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) + heading_res_loss_weight = heading_label_one_hot * \ + box_loss_weights.unsqueeze(-1) + + return (vote_targets, center_targets, size_res_targets, + dir_class_targets, dir_res_targets, mask_targets, + centerness_targets, corner3d_targets, vote_mask, positive_mask, + negative_mask, centerness_weights, box_loss_weights, + heading_res_loss_weight) + + def get_targets_single(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + aggregated_points=None, + seed_points=None): + """Generate targets of ssd3d head for single batch. + + Args: + points (torch.Tensor): Points of each batch. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth + boxes of each batch. + gt_labels_3d (torch.Tensor): Labels of each batch. + pts_semantic_mask (torch.Tensor): Point-wise semantic + label of each batch. + pts_instance_mask (torch.Tensor): Point-wise instance + label of each batch. + aggregated_points (torch.Tensor): Aggregated points from + candidate points layer. + seed_points (torch.Tensor): Seed points of candidate points. + + Returns: + tuple[torch.Tensor]: Targets of ssd3d head. 
+ """ + assert self.bbox_coder.with_rot or pts_semantic_mask is not None + gt_bboxes_3d = gt_bboxes_3d.to(points.device) + valid_gt = gt_labels_3d != -1 + gt_bboxes_3d = gt_bboxes_3d[valid_gt] + gt_labels_3d = gt_labels_3d[valid_gt] + + # Generate fake GT for empty scene + if valid_gt.sum() == 0: + vote_targets = points.new_zeros(self.num_candidates, 3) + center_targets = points.new_zeros(self.num_candidates, 3) + size_res_targets = points.new_zeros(self.num_candidates, 3) + dir_class_targets = points.new_zeros( + self.num_candidates, dtype=torch.int64) + dir_res_targets = points.new_zeros(self.num_candidates) + mask_targets = points.new_zeros( + self.num_candidates, dtype=torch.int64) + centerness_targets = points.new_zeros(self.num_candidates, + self.num_classes) + corner3d_targets = points.new_zeros(self.num_candidates, 8, 3) + vote_mask = points.new_zeros(self.num_candidates, dtype=torch.bool) + positive_mask = points.new_zeros( + self.num_candidates, dtype=torch.bool) + negative_mask = points.new_ones( + self.num_candidates, dtype=torch.bool) + return (vote_targets, center_targets, size_res_targets, + dir_class_targets, dir_res_targets, mask_targets, + centerness_targets, corner3d_targets, vote_mask, + positive_mask, negative_mask) + + gt_corner3d = gt_bboxes_3d.corners + + (center_targets, size_targets, dir_class_targets, + dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) + + points_mask, assignment = self._assign_targets_by_points_inside( + gt_bboxes_3d, aggregated_points) + + center_targets = center_targets[assignment] + size_res_targets = size_targets[assignment] + mask_targets = gt_labels_3d[assignment] + dir_class_targets = dir_class_targets[assignment] + dir_res_targets = dir_res_targets[assignment] + corner3d_targets = gt_corner3d[assignment] + + top_center_targets = center_targets.clone() + top_center_targets[:, 2] += size_res_targets[:, 2] + dist = torch.norm(aggregated_points - top_center_targets, dim=1) + dist_mask = dist < self.train_cfg.pos_distance_thr + positive_mask = (points_mask.max(1)[0] > 0) * dist_mask + negative_mask = (points_mask.max(1)[0] == 0) + + # Centerness loss targets + canonical_xyz = aggregated_points - center_targets + if self.bbox_coder.with_rot: + # TODO: Align points rotation implementation of + # LiDARInstance3DBoxes and DepthInstance3DBoxes + canonical_xyz = rotation_3d_in_axis( + canonical_xyz.unsqueeze(0).transpose(0, 1), + -gt_bboxes_3d.yaw[assignment], + axis=2).squeeze(1) + distance_front = torch.clamp( + size_res_targets[:, 0] - canonical_xyz[:, 0], min=0) + distance_back = torch.clamp( + size_res_targets[:, 0] + canonical_xyz[:, 0], min=0) + distance_left = torch.clamp( + size_res_targets[:, 1] - canonical_xyz[:, 1], min=0) + distance_right = torch.clamp( + size_res_targets[:, 1] + canonical_xyz[:, 1], min=0) + distance_top = torch.clamp( + size_res_targets[:, 2] - canonical_xyz[:, 2], min=0) + distance_bottom = torch.clamp( + size_res_targets[:, 2] + canonical_xyz[:, 2], min=0) + + centerness_l = torch.min(distance_front, distance_back) / torch.max( + distance_front, distance_back) + centerness_w = torch.min(distance_left, distance_right) / torch.max( + distance_left, distance_right) + centerness_h = torch.min(distance_bottom, distance_top) / torch.max( + distance_bottom, distance_top) + centerness_targets = torch.clamp( + centerness_l * centerness_w * centerness_h, min=0) + centerness_targets = centerness_targets.pow(1 / 3.0) + centerness_targets = torch.clamp(centerness_targets, min=0, max=1) + + proposal_num = 
centerness_targets.shape[0] + one_hot_centerness_targets = centerness_targets.new_zeros( + (proposal_num, self.num_classes)) + one_hot_centerness_targets.scatter_(1, mask_targets.unsqueeze(-1), 1) + centerness_targets = centerness_targets.unsqueeze( + 1) * one_hot_centerness_targets + + # Vote loss targets + enlarged_gt_bboxes_3d = gt_bboxes_3d.enlarged_box( + self.train_cfg.expand_dims_length) + enlarged_gt_bboxes_3d.tensor[:, 2] -= self.train_cfg.expand_dims_length + vote_mask, vote_assignment = self._assign_targets_by_points_inside( + enlarged_gt_bboxes_3d, seed_points) + + vote_targets = gt_bboxes_3d.gravity_center + vote_targets = vote_targets[vote_assignment] - seed_points + vote_mask = vote_mask.max(1)[0] > 0 + + return (vote_targets, center_targets, size_res_targets, + dir_class_targets, dir_res_targets, mask_targets, + centerness_targets, corner3d_targets, vote_mask, positive_mask, + negative_mask) + + def get_bboxes(self, points, bbox_preds, input_metas, rescale=False): + """Generate bboxes from 3DSSD head predictions. + + Args: + points (torch.Tensor): Input points. + bbox_preds (dict): Predictions from sdd3d head. + input_metas (list[dict]): Point cloud and image's meta info. + rescale (bool): Whether to rescale bboxes. + + Returns: + list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. + """ + # decode boxes + sem_scores = F.sigmoid(bbox_preds['obj_scores']).transpose(1, 2) + obj_scores = sem_scores.max(-1)[0] + bbox3d = self.bbox_coder.decode(bbox_preds) + + batch_size = bbox3d.shape[0] + results = list() + + for b in range(batch_size): + bbox_selected, score_selected, labels = self.multiclass_nms_single( + obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], + input_metas[b]) + + bbox = input_metas[b]['box_type_3d']( + bbox_selected.clone(), + box_dim=bbox_selected.shape[-1], + with_yaw=self.bbox_coder.with_rot) + results.append((bbox, score_selected, labels)) + + return results + + def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, + input_meta): + """Multi-class nms in single batch. + + Args: + obj_scores (torch.Tensor): Objectness score of bounding boxes. + sem_scores (torch.Tensor): Semantic class score of bounding boxes. + bbox (torch.Tensor): Predicted bounding boxes. + points (torch.Tensor): Input points. + input_meta (dict): Point cloud and image's meta info. + + Returns: + tuple[torch.Tensor]: Bounding boxes, scores and labels. 
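# A compact sketch of the 3D centerness target built in get_targets_single above:
# distances from each canonicalized point to the opposite box faces are combined per
# axis, the cube root gives a geometric mean, and the scalar is written only into the
# ground-truth class column. The half-extents are illustrative, `canonical_xyz` is
# assumed to already be in the box frame, and the small denominator clamps are added
# here only for numerical safety in the sketch.
import torch

canonical_xyz = torch.randn(5, 3) * 0.5                       # points relative to box centers
half_sizes = torch.tensor([[2.0, 1.0, 0.8]]).repeat(5, 1)     # (l/2, w/2, h/2) per point

front_back = torch.stack([half_sizes[:, 0] - canonical_xyz[:, 0],
                          half_sizes[:, 0] + canonical_xyz[:, 0]], dim=-1).clamp(min=0)
left_right = torch.stack([half_sizes[:, 1] - canonical_xyz[:, 1],
                          half_sizes[:, 1] + canonical_xyz[:, 1]], dim=-1).clamp(min=0)
top_bottom = torch.stack([half_sizes[:, 2] - canonical_xyz[:, 2],
                          half_sizes[:, 2] + canonical_xyz[:, 2]], dim=-1).clamp(min=0)

centerness = (front_back.min(-1)[0] / front_back.max(-1)[0].clamp(min=1e-6) *
              left_right.min(-1)[0] / left_right.max(-1)[0].clamp(min=1e-6) *
              top_bottom.min(-1)[0] / top_bottom.max(-1)[0].clamp(min=1e-6))
centerness = centerness.clamp(min=0).pow(1 / 3.0).clamp(max=1)   # geometric mean in [0, 1]

# per-class targets: the scalar centerness fills only the ground-truth class column
labels = torch.randint(0, 3, (5,))
one_hot = centerness.new_zeros(5, 3).scatter_(1, labels.unsqueeze(-1), 1)
centerness_targets = centerness.unsqueeze(1) * one_hot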
+ """ + bbox = input_meta['box_type_3d']( + bbox.clone(), + box_dim=bbox.shape[-1], + with_yaw=self.bbox_coder.with_rot, + origin=(0.5, 0.5, 0.5)) + + if isinstance(bbox, (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + box_indices = bbox.points_in_boxes_all(points) + nonempty_box_mask = box_indices.T.sum(1) >= 0 + else: + raise NotImplementedError('Unsupported bbox type!') + + corner3d = bbox.corners + minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) + minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] + minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] + + bbox_classes = torch.argmax(sem_scores, -1) + nms_keep = batched_nms( + minmax_box3d[nonempty_box_mask][:, [0, 1, 3, 4]], + obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], + self.test_cfg.nms_cfg)[1] + + if nms_keep.shape[0] > self.test_cfg.max_output_num: + nms_keep = nms_keep[:self.test_cfg.max_output_num] + + # filter empty boxes and boxes with low score + scores_mask = (obj_scores >= self.test_cfg.score_thr) + nonempty_box_inds = torch.nonzero( + nonempty_box_mask, as_tuple=False).flatten() + nonempty_mask = torch.zeros_like(bbox_classes).scatter( + 0, nonempty_box_inds[nms_keep], 1) + selected = (nonempty_mask.bool() & scores_mask.bool()) + + if self.test_cfg.per_class_proposal: + bbox_selected, score_selected, labels = [], [], [] + for k in range(sem_scores.shape[-1]): + bbox_selected.append(bbox[selected].tensor) + score_selected.append(obj_scores[selected]) + labels.append( + torch.zeros_like(bbox_classes[selected]).fill_(k)) + bbox_selected = torch.cat(bbox_selected, 0) + score_selected = torch.cat(score_selected, 0) + labels = torch.cat(labels, 0) + else: + bbox_selected = bbox[selected].tensor + score_selected = obj_scores[selected] + labels = bbox_classes[selected] + + return bbox_selected, score_selected, labels + + def _assign_targets_by_points_inside(self, bboxes_3d, points): + """Compute assignment by checking whether point is inside bbox. + + Args: + bboxes_3d (BaseInstance3DBoxes): Instance of bounding boxes. + points (torch.Tensor): Points of a batch. + + Returns: + tuple[torch.Tensor]: Flags indicating whether each point is + inside bbox and the index of box where each point are in. + """ + if isinstance(bboxes_3d, (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + points_mask = bboxes_3d.points_in_boxes_all(points) + assignment = points_mask.argmax(dim=-1) + else: + raise NotImplementedError('Unsupported bbox type!') + + return points_mask, assignment diff --git a/mmdet3d/models/dense_heads/train_mixins.py b/mmdet3d/models/dense_heads/train_mixins.py index 90c9cbb..046bfe6 100644 --- a/mmdet3d/models/dense_heads/train_mixins.py +++ b/mmdet3d/models/dense_heads/train_mixins.py @@ -1,349 +1,349 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - -from mmdet3d.core import limit_period -from mmdet.core import images_to_levels, multi_apply - - -class AnchorTrainMixin(object): - """Mixin class for target assigning of dense heads.""" - - def anchor_target_3d(self, - anchor_list, - gt_bboxes_list, - input_metas, - gt_bboxes_ignore_list=None, - gt_labels_list=None, - label_channels=1, - num_classes=1, - sampling=True): - """Compute regression and classification targets for anchors. - - Args: - anchor_list (list[list]): Multi level anchors of each image. - gt_bboxes_list (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each image. - input_metas (list[dict]): Meta info of each image. - gt_bboxes_ignore_list (list): Ignore list of gt bboxes. 
- gt_labels_list (list[torch.Tensor]): Gt labels of batches. - label_channels (int): The channel of labels. - num_classes (int): The number of classes. - sampling (bool): Whether to sample anchors. - - Returns: - tuple (list, list, list, list, list, list, int, int): - Anchor targets, including labels, label weights, - bbox targets, bbox weights, direction targets, - direction weights, number of positive anchors and - number of negative anchors. - """ - num_imgs = len(input_metas) - assert len(anchor_list) == num_imgs - - if isinstance(anchor_list[0][0], list): - # sizes of anchors are different - # anchor number of a single level - num_level_anchors = [ - sum([anchor.size(0) for anchor in anchors]) - for anchors in anchor_list[0] - ] - for i in range(num_imgs): - anchor_list[i] = anchor_list[i][0] - else: - # anchor number of multi levels - num_level_anchors = [ - anchors.view(-1, self.box_code_size).size(0) - for anchors in anchor_list[0] - ] - # concat all level anchors and flags to a single tensor - for i in range(num_imgs): - anchor_list[i] = torch.cat(anchor_list[i]) - - # compute targets for each image - if gt_bboxes_ignore_list is None: - gt_bboxes_ignore_list = [None for _ in range(num_imgs)] - if gt_labels_list is None: - gt_labels_list = [None for _ in range(num_imgs)] - - (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, - all_dir_targets, all_dir_weights, pos_inds_list, - neg_inds_list) = multi_apply( - self.anchor_target_3d_single, - anchor_list, - gt_bboxes_list, - gt_bboxes_ignore_list, - gt_labels_list, - input_metas, - label_channels=label_channels, - num_classes=num_classes, - sampling=sampling) - - # no valid anchors - if any([labels is None for labels in all_labels]): - return None - # sampled anchors of all images - num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) - num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) - # split targets to a list w.r.t. multiple levels - labels_list = images_to_levels(all_labels, num_level_anchors) - label_weights_list = images_to_levels(all_label_weights, - num_level_anchors) - bbox_targets_list = images_to_levels(all_bbox_targets, - num_level_anchors) - bbox_weights_list = images_to_levels(all_bbox_weights, - num_level_anchors) - dir_targets_list = images_to_levels(all_dir_targets, num_level_anchors) - dir_weights_list = images_to_levels(all_dir_weights, num_level_anchors) - return (labels_list, label_weights_list, bbox_targets_list, - bbox_weights_list, dir_targets_list, dir_weights_list, - num_total_pos, num_total_neg) - - def anchor_target_3d_single(self, - anchors, - gt_bboxes, - gt_bboxes_ignore, - gt_labels, - input_meta, - label_channels=1, - num_classes=1, - sampling=True): - """Compute targets of anchors in single batch. - - Args: - anchors (torch.Tensor): Concatenated multi-level anchor. - gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes. - gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes. - gt_labels (torch.Tensor): Gt class labels. - input_meta (dict): Meta info of each image. - label_channels (int): The channel of labels. - num_classes (int): The number of classes. - sampling (bool): Whether to sample anchors. - - Returns: - tuple[torch.Tensor]: Anchor targets. 
- """ - if isinstance(self.bbox_assigner, - list) and (not isinstance(anchors, list)): - feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2) - rot_angles = anchors.size(-2) - assert len(self.bbox_assigner) == anchors.size(-3) - (total_labels, total_label_weights, total_bbox_targets, - total_bbox_weights, total_dir_targets, total_dir_weights, - total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] - current_anchor_num = 0 - for i, assigner in enumerate(self.bbox_assigner): - current_anchors = anchors[..., i, :, :].reshape( - -1, self.box_code_size) - current_anchor_num += current_anchors.size(0) - if self.assign_per_class: - gt_per_cls = (gt_labels == i) - anchor_targets = self.anchor_target_single_assigner( - assigner, current_anchors, gt_bboxes[gt_per_cls, :], - gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, - num_classes, sampling) - else: - anchor_targets = self.anchor_target_single_assigner( - assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, - gt_labels, input_meta, num_classes, sampling) - - (labels, label_weights, bbox_targets, bbox_weights, - dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets - total_labels.append(labels.reshape(feat_size, 1, rot_angles)) - total_label_weights.append( - label_weights.reshape(feat_size, 1, rot_angles)) - total_bbox_targets.append( - bbox_targets.reshape(feat_size, 1, rot_angles, - anchors.size(-1))) - total_bbox_weights.append( - bbox_weights.reshape(feat_size, 1, rot_angles, - anchors.size(-1))) - total_dir_targets.append( - dir_targets.reshape(feat_size, 1, rot_angles)) - total_dir_weights.append( - dir_weights.reshape(feat_size, 1, rot_angles)) - total_pos_inds.append(pos_inds) - total_neg_inds.append(neg_inds) - - total_labels = torch.cat(total_labels, dim=-2).reshape(-1) - total_label_weights = torch.cat( - total_label_weights, dim=-2).reshape(-1) - total_bbox_targets = torch.cat( - total_bbox_targets, dim=-3).reshape(-1, anchors.size(-1)) - total_bbox_weights = torch.cat( - total_bbox_weights, dim=-3).reshape(-1, anchors.size(-1)) - total_dir_targets = torch.cat( - total_dir_targets, dim=-2).reshape(-1) - total_dir_weights = torch.cat( - total_dir_weights, dim=-2).reshape(-1) - total_pos_inds = torch.cat(total_pos_inds, dim=0).reshape(-1) - total_neg_inds = torch.cat(total_neg_inds, dim=0).reshape(-1) - return (total_labels, total_label_weights, total_bbox_targets, - total_bbox_weights, total_dir_targets, total_dir_weights, - total_pos_inds, total_neg_inds) - elif isinstance(self.bbox_assigner, list) and isinstance( - anchors, list): - # class-aware anchors with different feature map sizes - assert len(self.bbox_assigner) == len(anchors), \ - 'The number of bbox assigners and anchors should be the same.' 
- (total_labels, total_label_weights, total_bbox_targets, - total_bbox_weights, total_dir_targets, total_dir_weights, - total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] - current_anchor_num = 0 - for i, assigner in enumerate(self.bbox_assigner): - current_anchors = anchors[i] - current_anchor_num += current_anchors.size(0) - if self.assign_per_class: - gt_per_cls = (gt_labels == i) - anchor_targets = self.anchor_target_single_assigner( - assigner, current_anchors, gt_bboxes[gt_per_cls, :], - gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, - num_classes, sampling) - else: - anchor_targets = self.anchor_target_single_assigner( - assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, - gt_labels, input_meta, num_classes, sampling) - - (labels, label_weights, bbox_targets, bbox_weights, - dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets - total_labels.append(labels) - total_label_weights.append(label_weights) - total_bbox_targets.append( - bbox_targets.reshape(-1, anchors[i].size(-1))) - total_bbox_weights.append( - bbox_weights.reshape(-1, anchors[i].size(-1))) - total_dir_targets.append(dir_targets) - total_dir_weights.append(dir_weights) - total_pos_inds.append(pos_inds) - total_neg_inds.append(neg_inds) - - total_labels = torch.cat(total_labels, dim=0) - total_label_weights = torch.cat(total_label_weights, dim=0) - total_bbox_targets = torch.cat(total_bbox_targets, dim=0) - total_bbox_weights = torch.cat(total_bbox_weights, dim=0) - total_dir_targets = torch.cat(total_dir_targets, dim=0) - total_dir_weights = torch.cat(total_dir_weights, dim=0) - total_pos_inds = torch.cat(total_pos_inds, dim=0) - total_neg_inds = torch.cat(total_neg_inds, dim=0) - return (total_labels, total_label_weights, total_bbox_targets, - total_bbox_weights, total_dir_targets, total_dir_weights, - total_pos_inds, total_neg_inds) - else: - return self.anchor_target_single_assigner(self.bbox_assigner, - anchors, gt_bboxes, - gt_bboxes_ignore, - gt_labels, input_meta, - num_classes, sampling) - - def anchor_target_single_assigner(self, - bbox_assigner, - anchors, - gt_bboxes, - gt_bboxes_ignore, - gt_labels, - input_meta, - num_classes=1, - sampling=True): - """Assign anchors and encode positive anchors. - - Args: - bbox_assigner (BaseAssigner): assign positive and negative boxes. - anchors (torch.Tensor): Concatenated multi-level anchor. - gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes. - gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes. - gt_labels (torch.Tensor): Gt class labels. - input_meta (dict): Meta info of each image. - num_classes (int): The number of classes. - sampling (bool): Whether to sample anchors. - - Returns: - tuple[torch.Tensor]: Anchor targets. 
- """ - anchors = anchors.reshape(-1, anchors.size(-1)) - num_valid_anchors = anchors.shape[0] - bbox_targets = torch.zeros_like(anchors) - bbox_weights = torch.zeros_like(anchors) - dir_targets = anchors.new_zeros((anchors.shape[0]), dtype=torch.long) - dir_weights = anchors.new_zeros((anchors.shape[0]), dtype=torch.float) - labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long) - label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) - if len(gt_bboxes) > 0: - if not isinstance(gt_bboxes, torch.Tensor): - gt_bboxes = gt_bboxes.tensor.to(anchors.device) - assign_result = bbox_assigner.assign(anchors, gt_bboxes, - gt_bboxes_ignore, gt_labels) - sampling_result = self.bbox_sampler.sample(assign_result, anchors, - gt_bboxes) - pos_inds = sampling_result.pos_inds - neg_inds = sampling_result.neg_inds - else: - pos_inds = torch.nonzero( - anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) > 0, - as_tuple=False).squeeze(-1).unique() - neg_inds = torch.nonzero( - anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) == 0, - as_tuple=False).squeeze(-1).unique() - - if gt_labels is not None: - labels += num_classes - if len(pos_inds) > 0: - pos_bbox_targets = self.bbox_coder.encode( - sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) - pos_dir_targets = get_direction_target( - sampling_result.pos_bboxes, - pos_bbox_targets, - self.dir_offset, - self.dir_limit_offset, - one_hot=False) - bbox_targets[pos_inds, :] = pos_bbox_targets - bbox_weights[pos_inds, :] = 1.0 - dir_targets[pos_inds] = pos_dir_targets - dir_weights[pos_inds] = 1.0 - - if gt_labels is None: - labels[pos_inds] = 1 - else: - labels[pos_inds] = gt_labels[ - sampling_result.pos_assigned_gt_inds] - if self.train_cfg.pos_weight <= 0: - label_weights[pos_inds] = 1.0 - else: - label_weights[pos_inds] = self.train_cfg.pos_weight - - if len(neg_inds) > 0: - label_weights[neg_inds] = 1.0 - return (labels, label_weights, bbox_targets, bbox_weights, dir_targets, - dir_weights, pos_inds, neg_inds) - - -def get_direction_target(anchors, - reg_targets, - dir_offset=0, - dir_limit_offset=0, - num_bins=2, - one_hot=True): - """Encode direction to 0 ~ num_bins-1. - - Args: - anchors (torch.Tensor): Concatenated multi-level anchor. - reg_targets (torch.Tensor): Bbox regression targets. - dir_offset (int): Direction offset. - num_bins (int): Number of bins to divide 2*PI. - one_hot (bool): Whether to encode as one hot. - - Returns: - torch.Tensor: Encoded direction targets. - """ - rot_gt = reg_targets[..., 6] + anchors[..., 6] - offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, 2 * np.pi) - dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() - dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) - if one_hot: - dir_targets = torch.zeros( - *list(dir_cls_targets.shape), - num_bins, - dtype=anchors.dtype, - device=dir_cls_targets.device) - dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) - dir_cls_targets = dir_targets - return dir_cls_targets +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import torch + +from mmdet3d.core import limit_period +from mmdet.core import images_to_levels, multi_apply + + +class AnchorTrainMixin(object): + """Mixin class for target assigning of dense heads.""" + + def anchor_target_3d(self, + anchor_list, + gt_bboxes_list, + input_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + num_classes=1, + sampling=True): + """Compute regression and classification targets for anchors. + + Args: + anchor_list (list[list]): Multi level anchors of each image. + gt_bboxes_list (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each image. + input_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (list): Ignore list of gt bboxes. + gt_labels_list (list[torch.Tensor]): Gt labels of batches. + label_channels (int): The channel of labels. + num_classes (int): The number of classes. + sampling (bool): Whether to sample anchors. + + Returns: + tuple (list, list, list, list, list, list, int, int): + Anchor targets, including labels, label weights, + bbox targets, bbox weights, direction targets, + direction weights, number of positive anchors and + number of negative anchors. + """ + num_imgs = len(input_metas) + assert len(anchor_list) == num_imgs + + if isinstance(anchor_list[0][0], list): + # sizes of anchors are different + # anchor number of a single level + num_level_anchors = [ + sum([anchor.size(0) for anchor in anchors]) + for anchors in anchor_list[0] + ] + for i in range(num_imgs): + anchor_list[i] = anchor_list[i][0] + else: + # anchor number of multi levels + num_level_anchors = [ + anchors.view(-1, self.box_code_size).size(0) + for anchors in anchor_list[0] + ] + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + anchor_list[i] = torch.cat(anchor_list[i]) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + all_dir_targets, all_dir_weights, pos_inds_list, + neg_inds_list) = multi_apply( + self.anchor_target_3d_single, + anchor_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + input_metas, + label_channels=label_channels, + num_classes=num_classes, + sampling=sampling) + + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + dir_targets_list = images_to_levels(all_dir_targets, num_level_anchors) + dir_weights_list = images_to_levels(all_dir_weights, num_level_anchors) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, dir_targets_list, dir_weights_list, + num_total_pos, num_total_neg) + + def anchor_target_3d_single(self, + anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + input_meta, + label_channels=1, + num_classes=1, + sampling=True): + """Compute targets of anchors in single batch. 
+ + Args: + anchors (torch.Tensor): Concatenated multi-level anchor. + gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes. + gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes. + gt_labels (torch.Tensor): Gt class labels. + input_meta (dict): Meta info of each image. + label_channels (int): The channel of labels. + num_classes (int): The number of classes. + sampling (bool): Whether to sample anchors. + + Returns: + tuple[torch.Tensor]: Anchor targets. + """ + if isinstance(self.bbox_assigner, + list) and (not isinstance(anchors, list)): + feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2) + rot_angles = anchors.size(-2) + assert len(self.bbox_assigner) == anchors.size(-3) + (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] + current_anchor_num = 0 + for i, assigner in enumerate(self.bbox_assigner): + current_anchors = anchors[..., i, :, :].reshape( + -1, self.box_code_size) + current_anchor_num += current_anchors.size(0) + if self.assign_per_class: + gt_per_cls = (gt_labels == i) + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes[gt_per_cls, :], + gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, + num_classes, sampling) + else: + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, + gt_labels, input_meta, num_classes, sampling) + + (labels, label_weights, bbox_targets, bbox_weights, + dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets + total_labels.append(labels.reshape(feat_size, 1, rot_angles)) + total_label_weights.append( + label_weights.reshape(feat_size, 1, rot_angles)) + total_bbox_targets.append( + bbox_targets.reshape(feat_size, 1, rot_angles, + anchors.size(-1))) + total_bbox_weights.append( + bbox_weights.reshape(feat_size, 1, rot_angles, + anchors.size(-1))) + total_dir_targets.append( + dir_targets.reshape(feat_size, 1, rot_angles)) + total_dir_weights.append( + dir_weights.reshape(feat_size, 1, rot_angles)) + total_pos_inds.append(pos_inds) + total_neg_inds.append(neg_inds) + + total_labels = torch.cat(total_labels, dim=-2).reshape(-1) + total_label_weights = torch.cat( + total_label_weights, dim=-2).reshape(-1) + total_bbox_targets = torch.cat( + total_bbox_targets, dim=-3).reshape(-1, anchors.size(-1)) + total_bbox_weights = torch.cat( + total_bbox_weights, dim=-3).reshape(-1, anchors.size(-1)) + total_dir_targets = torch.cat( + total_dir_targets, dim=-2).reshape(-1) + total_dir_weights = torch.cat( + total_dir_weights, dim=-2).reshape(-1) + total_pos_inds = torch.cat(total_pos_inds, dim=0).reshape(-1) + total_neg_inds = torch.cat(total_neg_inds, dim=0).reshape(-1) + return (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) + elif isinstance(self.bbox_assigner, list) and isinstance( + anchors, list): + # class-aware anchors with different feature map sizes + assert len(self.bbox_assigner) == len(anchors), \ + 'The number of bbox assigners and anchors should be the same.' 
+ (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] + current_anchor_num = 0 + for i, assigner in enumerate(self.bbox_assigner): + current_anchors = anchors[i] + current_anchor_num += current_anchors.size(0) + if self.assign_per_class: + gt_per_cls = (gt_labels == i) + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes[gt_per_cls, :], + gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, + num_classes, sampling) + else: + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, + gt_labels, input_meta, num_classes, sampling) + + (labels, label_weights, bbox_targets, bbox_weights, + dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets + total_labels.append(labels) + total_label_weights.append(label_weights) + total_bbox_targets.append( + bbox_targets.reshape(-1, anchors[i].size(-1))) + total_bbox_weights.append( + bbox_weights.reshape(-1, anchors[i].size(-1))) + total_dir_targets.append(dir_targets) + total_dir_weights.append(dir_weights) + total_pos_inds.append(pos_inds) + total_neg_inds.append(neg_inds) + + total_labels = torch.cat(total_labels, dim=0) + total_label_weights = torch.cat(total_label_weights, dim=0) + total_bbox_targets = torch.cat(total_bbox_targets, dim=0) + total_bbox_weights = torch.cat(total_bbox_weights, dim=0) + total_dir_targets = torch.cat(total_dir_targets, dim=0) + total_dir_weights = torch.cat(total_dir_weights, dim=0) + total_pos_inds = torch.cat(total_pos_inds, dim=0) + total_neg_inds = torch.cat(total_neg_inds, dim=0) + return (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) + else: + return self.anchor_target_single_assigner(self.bbox_assigner, + anchors, gt_bboxes, + gt_bboxes_ignore, + gt_labels, input_meta, + num_classes, sampling) + + def anchor_target_single_assigner(self, + bbox_assigner, + anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + input_meta, + num_classes=1, + sampling=True): + """Assign anchors and encode positive anchors. + + Args: + bbox_assigner (BaseAssigner): assign positive and negative boxes. + anchors (torch.Tensor): Concatenated multi-level anchor. + gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes. + gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes. + gt_labels (torch.Tensor): Gt class labels. + input_meta (dict): Meta info of each image. + num_classes (int): The number of classes. + sampling (bool): Whether to sample anchors. + + Returns: + tuple[torch.Tensor]: Anchor targets. 
+ """ + anchors = anchors.reshape(-1, anchors.size(-1)) + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + dir_targets = anchors.new_zeros((anchors.shape[0]), dtype=torch.long) + dir_weights = anchors.new_zeros((anchors.shape[0]), dtype=torch.float) + labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + if len(gt_bboxes) > 0: + if not isinstance(gt_bboxes, torch.Tensor): + gt_bboxes = gt_bboxes.tensor.to(anchors.device) + assign_result = bbox_assigner.assign(anchors, gt_bboxes, + gt_bboxes_ignore, gt_labels) + sampling_result = self.bbox_sampler.sample(assign_result, anchors, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + else: + pos_inds = torch.nonzero( + anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) > 0, + as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) == 0, + as_tuple=False).squeeze(-1).unique() + + if gt_labels is not None: + labels += num_classes + if len(pos_inds) > 0: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + pos_dir_targets = get_direction_target( + sampling_result.pos_bboxes, + pos_bbox_targets, + self.dir_offset, + self.dir_limit_offset, + one_hot=False) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + dir_targets[pos_inds] = pos_dir_targets + dir_weights[pos_inds] = 1.0 + + if gt_labels is None: + labels[pos_inds] = 1 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + return (labels, label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, pos_inds, neg_inds) + + +def get_direction_target(anchors, + reg_targets, + dir_offset=0, + dir_limit_offset=0, + num_bins=2, + one_hot=True): + """Encode direction to 0 ~ num_bins-1. + + Args: + anchors (torch.Tensor): Concatenated multi-level anchor. + reg_targets (torch.Tensor): Bbox regression targets. + dir_offset (int): Direction offset. + num_bins (int): Number of bins to divide 2*PI. + one_hot (bool): Whether to encode as one hot. + + Returns: + torch.Tensor: Encoded direction targets. + """ + rot_gt = reg_targets[..., 6] + anchors[..., 6] + offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, 2 * np.pi) + dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() + dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) + if one_hot: + dir_targets = torch.zeros( + *list(dir_cls_targets.shape), + num_bins, + dtype=anchors.dtype, + device=dir_cls_targets.device) + dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) + dir_cls_targets = dir_targets + return dir_cls_targets diff --git a/mmdet3d/models/dense_heads/vote_head.py b/mmdet3d/models/dense_heads/vote_head.py index 53b1154..fc3b79b 100644 --- a/mmdet3d/models/dense_heads/vote_head.py +++ b/mmdet3d/models/dense_heads/vote_head.py @@ -1,663 +1,663 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import numpy as np -import torch -from mmcv.ops import furthest_point_sample -from mmcv.runner import BaseModule, force_fp32 -from torch.nn import functional as F - -from mmdet3d.core.post_processing import aligned_3d_nms -from mmdet3d.models.losses import chamfer_distance -from mmdet3d.models.model_utils import VoteModule -from mmdet3d.ops import build_sa_module -from mmdet.core import build_bbox_coder, multi_apply -from ..builder import HEADS, build_loss -from .base_conv_bbox_head import BaseConvBboxHead - - -@HEADS.register_module() -class VoteHead(BaseModule): - r"""Bbox head of `Votenet `_. - - Args: - num_classes (int): The number of class. - bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and - decoding boxes. - train_cfg (dict): Config for training. - test_cfg (dict): Config for testing. - vote_module_cfg (dict): Config of VoteModule for point-wise votes. - vote_aggregation_cfg (dict): Config of vote aggregation layer. - pred_layer_cfg (dict): Config of classfication and regression - prediction layers. - conv_cfg (dict): Config of convolution in prediction layer. - norm_cfg (dict): Config of BN in prediction layer. - objectness_loss (dict): Config of objectness loss. - center_loss (dict): Config of center loss. - dir_class_loss (dict): Config of direction classification loss. - dir_res_loss (dict): Config of direction residual regression loss. - size_class_loss (dict): Config of size classification loss. - size_res_loss (dict): Config of size residual regression loss. - semantic_loss (dict): Config of point-wise semantic segmentation loss. - """ - - def __init__(self, - num_classes, - bbox_coder, - train_cfg=None, - test_cfg=None, - vote_module_cfg=None, - vote_aggregation_cfg=None, - pred_layer_cfg=None, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - objectness_loss=None, - center_loss=None, - dir_class_loss=None, - dir_res_loss=None, - size_class_loss=None, - size_res_loss=None, - semantic_loss=None, - iou_loss=None, - init_cfg=None): - super(VoteHead, self).__init__(init_cfg=init_cfg) - self.num_classes = num_classes - self.train_cfg = train_cfg - self.test_cfg = test_cfg - self.gt_per_seed = vote_module_cfg['gt_per_seed'] - self.num_proposal = vote_aggregation_cfg['num_point'] - - self.objectness_loss = build_loss(objectness_loss) - self.center_loss = build_loss(center_loss) - self.dir_res_loss = build_loss(dir_res_loss) - self.dir_class_loss = build_loss(dir_class_loss) - self.size_res_loss = build_loss(size_res_loss) - if size_class_loss is not None: - self.size_class_loss = build_loss(size_class_loss) - if semantic_loss is not None: - self.semantic_loss = build_loss(semantic_loss) - if iou_loss is not None: - self.iou_loss = build_loss(iou_loss) - else: - self.iou_loss = None - - self.bbox_coder = build_bbox_coder(bbox_coder) - self.num_sizes = self.bbox_coder.num_sizes - self.num_dir_bins = self.bbox_coder.num_dir_bins - - self.vote_module = VoteModule(**vote_module_cfg) - self.vote_aggregation = build_sa_module(vote_aggregation_cfg) - self.fp16_enabled = False - - # Bbox classification and regression - self.conv_pred = BaseConvBboxHead( - **pred_layer_cfg, - num_cls_out_channels=self._get_cls_out_channels(), - num_reg_out_channels=self._get_reg_out_channels()) - - def _get_cls_out_channels(self): - """Return the channel number of classification outputs.""" - # Class numbers (k) + objectness (2) - return self.num_classes + 2 - - def _get_reg_out_channels(self): - """Return the channel number of regression outputs.""" - # Objectness scores (2), 
center residual (3), - # heading class+residual (num_dir_bins*2), - # size class+residual(num_sizes*4) - return 3 + self.num_dir_bins * 2 + self.num_sizes * 4 - - def _extract_input(self, feat_dict): - """Extract inputs from features dictionary. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - torch.Tensor: Coordinates of input points. - torch.Tensor: Features of input points. - torch.Tensor: Indices of input points. - """ - - # for imvotenet - if 'seed_points' in feat_dict and \ - 'seed_features' in feat_dict and \ - 'seed_indices' in feat_dict: - seed_points = feat_dict['seed_points'] - seed_features = feat_dict['seed_features'] - seed_indices = feat_dict['seed_indices'] - # for votenet - else: - seed_points = feat_dict['fp_xyz'][-1] - seed_features = feat_dict['fp_features'][-1] - seed_indices = feat_dict['fp_indices'][-1] - - return seed_points, seed_features, seed_indices - - def forward(self, feat_dict, sample_mod): - """Forward pass. - - Note: - The forward of VoteHead is divided into 4 steps: - - 1. Generate vote_points from seed_points. - 2. Aggregate vote_points. - 3. Predict bbox and score. - 4. Decode predictions. - - Args: - feat_dict (dict): Feature dict from backbone. - sample_mod (str): Sample mode for vote aggregation layer. - valid modes are "vote", "seed", "random" and "spec". - - Returns: - dict: Predictions of vote head. - """ - assert sample_mod in ['vote', 'seed', 'random', 'spec'] - - seed_points, seed_features, seed_indices = self._extract_input( - feat_dict) - - # 1. generate vote_points from seed_points - vote_points, vote_features, vote_offset = self.vote_module( - seed_points, seed_features) - results = dict( - seed_points=seed_points, - seed_indices=seed_indices, - vote_points=vote_points, - vote_features=vote_features, - vote_offset=vote_offset) - - # 2. aggregate vote_points - if sample_mod == 'vote': - # use fps in vote_aggregation - aggregation_inputs = dict( - points_xyz=vote_points, features=vote_features) - elif sample_mod == 'seed': - # FPS on seed and choose the votes corresponding to the seeds - sample_indices = furthest_point_sample(seed_points, - self.num_proposal) - aggregation_inputs = dict( - points_xyz=vote_points, - features=vote_features, - indices=sample_indices) - elif sample_mod == 'random': - # Random sampling from the votes - batch_size, num_seed = seed_points.shape[:2] - sample_indices = seed_points.new_tensor( - torch.randint(0, num_seed, (batch_size, self.num_proposal)), - dtype=torch.int32) - aggregation_inputs = dict( - points_xyz=vote_points, - features=vote_features, - indices=sample_indices) - elif sample_mod == 'spec': - # Specify the new center in vote_aggregation - aggregation_inputs = dict( - points_xyz=seed_points, - features=seed_features, - target_xyz=vote_points) - else: - raise NotImplementedError( - f'Sample mode {sample_mod} is not supported!') - - vote_aggregation_ret = self.vote_aggregation(**aggregation_inputs) - aggregated_points, features, aggregated_indices = vote_aggregation_ret - - results['aggregated_points'] = aggregated_points - results['aggregated_features'] = features - results['aggregated_indices'] = aggregated_indices - - # 3. predict bbox and score - cls_predictions, reg_predictions = self.conv_pred(features) - - # 4. 
decode predictions - decode_res = self.bbox_coder.split_pred(cls_predictions, - reg_predictions, - aggregated_points) - - results.update(decode_res) - - return results - - @force_fp32(apply_to=('bbox_preds', )) - def loss(self, - bbox_preds, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - img_metas=None, - gt_bboxes_ignore=None, - ret_target=False): - """Compute loss. - - Args: - bbox_preds (dict): Predictions from forward of vote head. - points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each sample. - gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (list[torch.Tensor]): Point-wise - semantic mask. - pts_instance_mask (list[torch.Tensor]): Point-wise - instance mask. - img_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - ret_target (Bool): Return targets or not. - - Returns: - dict: Losses of Votenet. - """ - targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, - bbox_preds) - (vote_targets, vote_target_masks, size_class_targets, size_res_targets, - dir_class_targets, dir_res_targets, center_targets, - assigned_center_targets, mask_targets, valid_gt_masks, - objectness_targets, objectness_weights, box_loss_weights, - valid_gt_weights) = targets - - # calculate vote loss - vote_loss = self.vote_module.get_loss(bbox_preds['seed_points'], - bbox_preds['vote_points'], - bbox_preds['seed_indices'], - vote_target_masks, vote_targets) - - # calculate objectness loss - objectness_loss = self.objectness_loss( - bbox_preds['obj_scores'].transpose(2, 1), - objectness_targets, - weight=objectness_weights) - - # calculate center loss - source2target_loss, target2source_loss = self.center_loss( - bbox_preds['center'], - center_targets, - src_weight=box_loss_weights, - dst_weight=valid_gt_weights) - center_loss = source2target_loss + target2source_loss - - # calculate direction class loss - dir_class_loss = self.dir_class_loss( - bbox_preds['dir_class'].transpose(2, 1), - dir_class_targets, - weight=box_loss_weights) - - # calculate direction residual loss - batch_size, proposal_num = size_class_targets.shape[:2] - heading_label_one_hot = vote_targets.new_zeros( - (batch_size, proposal_num, self.num_dir_bins)) - heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) - dir_res_norm = torch.sum( - bbox_preds['dir_res_norm'] * heading_label_one_hot, -1) - dir_res_loss = self.dir_res_loss( - dir_res_norm, dir_res_targets, weight=box_loss_weights) - - # calculate size class loss - size_class_loss = self.size_class_loss( - bbox_preds['size_class'].transpose(2, 1), - size_class_targets, - weight=box_loss_weights) - - # calculate size residual loss - one_hot_size_targets = vote_targets.new_zeros( - (batch_size, proposal_num, self.num_sizes)) - one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) - one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( - -1).repeat(1, 1, 1, 3).contiguous() - size_residual_norm = torch.sum( - bbox_preds['size_res_norm'] * one_hot_size_targets_expand, 2) - box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat( - 1, 1, 3) - size_res_loss = self.size_res_loss( - size_residual_norm, - size_res_targets, - weight=box_loss_weights_expand) - - # calculate semantic loss - semantic_loss = self.semantic_loss( - bbox_preds['sem_scores'].transpose(2, 1), - mask_targets, - 
weight=box_loss_weights) - - losses = dict( - vote_loss=vote_loss, - objectness_loss=objectness_loss, - semantic_loss=semantic_loss, - center_loss=center_loss, - dir_class_loss=dir_class_loss, - dir_res_loss=dir_res_loss, - size_class_loss=size_class_loss, - size_res_loss=size_res_loss) - - if self.iou_loss: - corners_pred = self.bbox_coder.decode_corners( - bbox_preds['center'], size_residual_norm, - one_hot_size_targets_expand) - corners_target = self.bbox_coder.decode_corners( - assigned_center_targets, size_res_targets, - one_hot_size_targets_expand) - iou_loss = self.iou_loss( - corners_pred, corners_target, weight=box_loss_weights) - losses['iou_loss'] = iou_loss - - if ret_target: - losses['targets'] = targets - - return losses - - def get_targets(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - bbox_preds=None): - """Generate targets of vote head. - - Args: - points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (list[torch.Tensor]): Point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): Point-wise instance - label of each batch. - bbox_preds (torch.Tensor): Bounding box predictions of vote head. - - Returns: - tuple[torch.Tensor]: Targets of vote head. - """ - # find empty example - valid_gt_masks = list() - gt_num = list() - for index in range(len(gt_labels_3d)): - if len(gt_labels_3d[index]) == 0: - fake_box = gt_bboxes_3d[index].tensor.new_zeros( - 1, gt_bboxes_3d[index].tensor.shape[-1]) - gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) - gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) - valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) - gt_num.append(1) - else: - valid_gt_masks.append(gt_labels_3d[index].new_ones( - gt_labels_3d[index].shape)) - gt_num.append(gt_labels_3d[index].shape[0]) - max_gt_num = max(gt_num) - - if pts_semantic_mask is None: - pts_semantic_mask = [None for i in range(len(gt_labels_3d))] - pts_instance_mask = [None for i in range(len(gt_labels_3d))] - - aggregated_points = [ - bbox_preds['aggregated_points'][i] - for i in range(len(gt_labels_3d)) - ] - - (vote_targets, vote_target_masks, size_class_targets, size_res_targets, - dir_class_targets, dir_res_targets, center_targets, - assigned_center_targets, mask_targets, objectness_targets, - objectness_masks) = multi_apply(self.get_targets_single, points, - gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, - aggregated_points) - - # pad targets as original code of votenet. 
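The size-residual term in the loss above supervises only the residual channel belonging to each proposal's target size class: a one-hot mask over num_sizes is scattered from size_class_targets, broadcast to the 3 box dimensions, and used to collapse the (B, N, num_sizes, 3) prediction to (B, N, 3). The sketch below replays that select-by-one-hot reduction on random tensors with made-up shapes, only to show that the masked sum equals a direct gather by class id.

```python
import torch

batch_size, proposal_num, num_sizes = 2, 5, 4
size_res_norm = torch.randn(batch_size, proposal_num, num_sizes, 3)
size_class_targets = torch.randint(0, num_sizes, (batch_size, proposal_num))

# One-hot mask over the size classes, expanded to the 3 box dimensions.
one_hot = size_res_norm.new_zeros(batch_size, proposal_num, num_sizes)
one_hot.scatter_(2, size_class_targets.unsqueeze(-1), 1)
one_hot_expand = one_hot.unsqueeze(-1).repeat(1, 1, 1, 3)

# Masked sum over the class axis keeps only the residual of the target class.
selected = torch.sum(size_res_norm * one_hot_expand, 2)

# Equivalent direct gather, for comparison.
gathered = torch.gather(
    size_res_norm, 2,
    size_class_targets[..., None, None].expand(-1, -1, 1, 3)).squeeze(2)
print(selected.shape, torch.allclose(selected, gathered))  # torch.Size([2, 5, 3]) True
```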
- for index in range(len(gt_labels_3d)): - pad_num = max_gt_num - gt_labels_3d[index].shape[0] - center_targets[index] = F.pad(center_targets[index], - (0, 0, 0, pad_num)) - valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num)) - - vote_targets = torch.stack(vote_targets) - vote_target_masks = torch.stack(vote_target_masks) - center_targets = torch.stack(center_targets) - valid_gt_masks = torch.stack(valid_gt_masks) - - assigned_center_targets = torch.stack(assigned_center_targets) - objectness_targets = torch.stack(objectness_targets) - objectness_weights = torch.stack(objectness_masks) - objectness_weights /= (torch.sum(objectness_weights) + 1e-6) - box_loss_weights = objectness_targets.float() / ( - torch.sum(objectness_targets).float() + 1e-6) - valid_gt_weights = valid_gt_masks.float() / ( - torch.sum(valid_gt_masks.float()) + 1e-6) - dir_class_targets = torch.stack(dir_class_targets) - dir_res_targets = torch.stack(dir_res_targets) - size_class_targets = torch.stack(size_class_targets) - size_res_targets = torch.stack(size_res_targets) - mask_targets = torch.stack(mask_targets) - - return (vote_targets, vote_target_masks, size_class_targets, - size_res_targets, dir_class_targets, dir_res_targets, - center_targets, assigned_center_targets, mask_targets, - valid_gt_masks, objectness_targets, objectness_weights, - box_loss_weights, valid_gt_weights) - - def get_targets_single(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - aggregated_points=None): - """Generate targets of vote head for single batch. - - Args: - points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth - boxes of each batch. - gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (torch.Tensor): Point-wise semantic - label of each batch. - pts_instance_mask (torch.Tensor): Point-wise instance - label of each batch. - aggregated_points (torch.Tensor): Aggregated points from - vote aggregation layer. - - Returns: - tuple[torch.Tensor]: Targets of vote head. 
- """ - assert self.bbox_coder.with_rot or pts_semantic_mask is not None - - gt_bboxes_3d = gt_bboxes_3d.to(points.device) - - # generate votes target - num_points = points.shape[0] - if self.bbox_coder.with_rot: - vote_targets = points.new_zeros([num_points, 3 * self.gt_per_seed]) - vote_target_masks = points.new_zeros([num_points], - dtype=torch.long) - vote_target_idx = points.new_zeros([num_points], dtype=torch.long) - box_indices_all = gt_bboxes_3d.points_in_boxes_all(points) - for i in range(gt_labels_3d.shape[0]): - box_indices = box_indices_all[:, i] - indices = torch.nonzero( - box_indices, as_tuple=False).squeeze(-1) - selected_points = points[indices] - vote_target_masks[indices] = 1 - vote_targets_tmp = vote_targets[indices] - votes = gt_bboxes_3d.gravity_center[i].unsqueeze( - 0) - selected_points[:, :3] - - for j in range(self.gt_per_seed): - column_indices = torch.nonzero( - vote_target_idx[indices] == j, - as_tuple=False).squeeze(-1) - vote_targets_tmp[column_indices, - int(j * 3):int(j * 3 + - 3)] = votes[column_indices] - if j == 0: - vote_targets_tmp[column_indices] = votes[ - column_indices].repeat(1, self.gt_per_seed) - - vote_targets[indices] = vote_targets_tmp - vote_target_idx[indices] = torch.clamp( - vote_target_idx[indices] + 1, max=2) - elif pts_semantic_mask is not None: - vote_targets = points.new_zeros([num_points, 3]) - vote_target_masks = points.new_zeros([num_points], - dtype=torch.long) - - for i in torch.unique(pts_instance_mask): - indices = torch.nonzero( - pts_instance_mask == i, as_tuple=False).squeeze(-1) - if pts_semantic_mask[indices[0]] < self.num_classes: - selected_points = points[indices, :3] - center = 0.5 * ( - selected_points.min(0)[0] + selected_points.max(0)[0]) - vote_targets[indices, :] = center - selected_points - vote_target_masks[indices] = 1 - vote_targets = vote_targets.repeat((1, self.gt_per_seed)) - else: - raise NotImplementedError - - (center_targets, size_class_targets, size_res_targets, - dir_class_targets, - dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) - - proposal_num = aggregated_points.shape[0] - distance1, _, assignment, _ = chamfer_distance( - aggregated_points.unsqueeze(0), - center_targets.unsqueeze(0), - reduction='none') - assignment = assignment.squeeze(0) - euclidean_distance1 = torch.sqrt(distance1.squeeze(0) + 1e-6) - - objectness_targets = points.new_zeros((proposal_num), dtype=torch.long) - objectness_targets[ - euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1 - - objectness_masks = points.new_zeros((proposal_num)) - objectness_masks[ - euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1.0 - objectness_masks[ - euclidean_distance1 > self.train_cfg['neg_distance_thr']] = 1.0 - - dir_class_targets = dir_class_targets[assignment] - dir_res_targets = dir_res_targets[assignment] - dir_res_targets /= (np.pi / self.num_dir_bins) - size_class_targets = size_class_targets[assignment] - size_res_targets = size_res_targets[assignment] - - one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros( - (proposal_num, self.num_sizes)) - one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1) - one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).repeat( - 1, 1, 3) - mean_sizes = size_res_targets.new_tensor( - self.bbox_coder.mean_sizes).unsqueeze(0) - pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1) - size_res_targets /= pos_mean_sizes - - mask_targets = gt_labels_3d[assignment] - assigned_center_targets = center_targets[assignment] - - return 
(vote_targets, vote_target_masks, size_class_targets, - size_res_targets, dir_class_targets, - dir_res_targets, center_targets, assigned_center_targets, - mask_targets.long(), objectness_targets, objectness_masks) - - def get_bboxes(self, - points, - bbox_preds, - input_metas, - rescale=False, - use_nms=True): - """Generate bboxes from vote head predictions. - - Args: - points (torch.Tensor): Input points. - bbox_preds (dict): Predictions from vote head. - input_metas (list[dict]): Point cloud and image's meta info. - rescale (bool): Whether to rescale bboxes. - use_nms (bool): Whether to apply NMS, skip nms postprocessing - while using vote head in rpn stage. - - Returns: - list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. - """ - # decode boxes - obj_scores = F.softmax(bbox_preds['obj_scores'], dim=-1)[..., -1] - sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1) - bbox3d = self.bbox_coder.decode(bbox_preds) - - if use_nms: - batch_size = bbox3d.shape[0] - results = list() - for b in range(batch_size): - bbox_selected, score_selected, labels = \ - self.multiclass_nms_single(obj_scores[b], sem_scores[b], - bbox3d[b], points[b, ..., :3], - input_metas[b]) - bbox = input_metas[b]['box_type_3d']( - bbox_selected, - box_dim=bbox_selected.shape[-1], - with_yaw=self.bbox_coder.with_rot) - results.append((bbox, score_selected, labels)) - - return results - else: - return bbox3d - - def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, - input_meta): - """Multi-class nms in single batch. - - Args: - obj_scores (torch.Tensor): Objectness score of bounding boxes. - sem_scores (torch.Tensor): semantic class score of bounding boxes. - bbox (torch.Tensor): Predicted bounding boxes. - points (torch.Tensor): Input points. - input_meta (dict): Point cloud and image's meta info. - - Returns: - tuple[torch.Tensor]: Bounding boxes, scores and labels. - """ - bbox = input_meta['box_type_3d']( - bbox, - box_dim=bbox.shape[-1], - with_yaw=self.bbox_coder.with_rot, - origin=(0.5, 0.5, 0.5)) - box_indices = bbox.points_in_boxes_all(points) - - corner3d = bbox.corners - minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) - minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] - minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] - - nonempty_box_mask = box_indices.T.sum(1) > 5 - - bbox_classes = torch.argmax(sem_scores, -1) - nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], - obj_scores[nonempty_box_mask], - bbox_classes[nonempty_box_mask], - self.test_cfg.nms_thr) - - # filter empty boxes and boxes with low score - scores_mask = (obj_scores > self.test_cfg.score_thr) - nonempty_box_inds = torch.nonzero( - nonempty_box_mask, as_tuple=False).flatten() - nonempty_mask = torch.zeros_like(bbox_classes).scatter( - 0, nonempty_box_inds[nms_selected], 1) - selected = (nonempty_mask.bool() & scores_mask.bool()) - - if self.test_cfg.per_class_proposal: - bbox_selected, score_selected, labels = [], [], [] - for k in range(sem_scores.shape[-1]): - bbox_selected.append(bbox[selected].tensor) - score_selected.append(obj_scores[selected] * - sem_scores[selected][:, k]) - labels.append( - torch.zeros_like(bbox_classes[selected]).fill_(k)) - bbox_selected = torch.cat(bbox_selected, 0) - score_selected = torch.cat(score_selected, 0) - labels = torch.cat(labels, 0) - else: - bbox_selected = bbox[selected].tensor - score_selected = obj_scores[selected] - labels = bbox_classes[selected] - - return bbox_selected, score_selected, labels +# Copyright (c) OpenMMLab. 
All rights reserved. +import numpy as np +import torch +from mmcv.ops import furthest_point_sample +from mmcv.runner import BaseModule, force_fp32 +from torch.nn import functional as F + +from mmdet3d.core.post_processing import aligned_3d_nms +from mmdet3d.models.losses import chamfer_distance +from mmdet3d.models.model_utils import VoteModule +from mmdet3d.ops import build_sa_module +from mmdet.core import build_bbox_coder, multi_apply +from ..builder import HEADS, build_loss +from .base_conv_bbox_head import BaseConvBboxHead + + +@HEADS.register_module() +class VoteHead(BaseModule): + r"""Bbox head of `Votenet `_. + + Args: + num_classes (int): The number of class. + bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and + decoding boxes. + train_cfg (dict): Config for training. + test_cfg (dict): Config for testing. + vote_module_cfg (dict): Config of VoteModule for point-wise votes. + vote_aggregation_cfg (dict): Config of vote aggregation layer. + pred_layer_cfg (dict): Config of classfication and regression + prediction layers. + conv_cfg (dict): Config of convolution in prediction layer. + norm_cfg (dict): Config of BN in prediction layer. + objectness_loss (dict): Config of objectness loss. + center_loss (dict): Config of center loss. + dir_class_loss (dict): Config of direction classification loss. + dir_res_loss (dict): Config of direction residual regression loss. + size_class_loss (dict): Config of size classification loss. + size_res_loss (dict): Config of size residual regression loss. + semantic_loss (dict): Config of point-wise semantic segmentation loss. + """ + + def __init__(self, + num_classes, + bbox_coder, + train_cfg=None, + test_cfg=None, + vote_module_cfg=None, + vote_aggregation_cfg=None, + pred_layer_cfg=None, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=None, + center_loss=None, + dir_class_loss=None, + dir_res_loss=None, + size_class_loss=None, + size_res_loss=None, + semantic_loss=None, + iou_loss=None, + init_cfg=None): + super(VoteHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.gt_per_seed = vote_module_cfg['gt_per_seed'] + self.num_proposal = vote_aggregation_cfg['num_point'] + + self.objectness_loss = build_loss(objectness_loss) + self.center_loss = build_loss(center_loss) + self.dir_res_loss = build_loss(dir_res_loss) + self.dir_class_loss = build_loss(dir_class_loss) + self.size_res_loss = build_loss(size_res_loss) + if size_class_loss is not None: + self.size_class_loss = build_loss(size_class_loss) + if semantic_loss is not None: + self.semantic_loss = build_loss(semantic_loss) + if iou_loss is not None: + self.iou_loss = build_loss(iou_loss) + else: + self.iou_loss = None + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.num_sizes = self.bbox_coder.num_sizes + self.num_dir_bins = self.bbox_coder.num_dir_bins + + self.vote_module = VoteModule(**vote_module_cfg) + self.vote_aggregation = build_sa_module(vote_aggregation_cfg) + self.fp16_enabled = False + + # Bbox classification and regression + self.conv_pred = BaseConvBboxHead( + **pred_layer_cfg, + num_cls_out_channels=self._get_cls_out_channels(), + num_reg_out_channels=self._get_reg_out_channels()) + + def _get_cls_out_channels(self): + """Return the channel number of classification outputs.""" + # Class numbers (k) + objectness (2) + return self.num_classes + 2 + + def _get_reg_out_channels(self): + """Return the channel number of regression outputs.""" + # 
Objectness scores (2), center residual (3), + # heading class+residual (num_dir_bins*2), + # size class+residual(num_sizes*4) + return 3 + self.num_dir_bins * 2 + self.num_sizes * 4 + + def _extract_input(self, feat_dict): + """Extract inputs from features dictionary. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + torch.Tensor: Coordinates of input points. + torch.Tensor: Features of input points. + torch.Tensor: Indices of input points. + """ + + # for imvotenet + if 'seed_points' in feat_dict and \ + 'seed_features' in feat_dict and \ + 'seed_indices' in feat_dict: + seed_points = feat_dict['seed_points'] + seed_features = feat_dict['seed_features'] + seed_indices = feat_dict['seed_indices'] + # for votenet + else: + seed_points = feat_dict['fp_xyz'][-1] + seed_features = feat_dict['fp_features'][-1] + seed_indices = feat_dict['fp_indices'][-1] + + return seed_points, seed_features, seed_indices + + def forward(self, feat_dict, sample_mod): + """Forward pass. + + Note: + The forward of VoteHead is divided into 4 steps: + + 1. Generate vote_points from seed_points. + 2. Aggregate vote_points. + 3. Predict bbox and score. + 4. Decode predictions. + + Args: + feat_dict (dict): Feature dict from backbone. + sample_mod (str): Sample mode for vote aggregation layer. + valid modes are "vote", "seed", "random" and "spec". + + Returns: + dict: Predictions of vote head. + """ + assert sample_mod in ['vote', 'seed', 'random', 'spec'] + + seed_points, seed_features, seed_indices = self._extract_input( + feat_dict) + + # 1. generate vote_points from seed_points + vote_points, vote_features, vote_offset = self.vote_module( + seed_points, seed_features) + results = dict( + seed_points=seed_points, + seed_indices=seed_indices, + vote_points=vote_points, + vote_features=vote_features, + vote_offset=vote_offset) + + # 2. aggregate vote_points + if sample_mod == 'vote': + # use fps in vote_aggregation + aggregation_inputs = dict( + points_xyz=vote_points, features=vote_features) + elif sample_mod == 'seed': + # FPS on seed and choose the votes corresponding to the seeds + sample_indices = furthest_point_sample(seed_points, + self.num_proposal) + aggregation_inputs = dict( + points_xyz=vote_points, + features=vote_features, + indices=sample_indices) + elif sample_mod == 'random': + # Random sampling from the votes + batch_size, num_seed = seed_points.shape[:2] + sample_indices = seed_points.new_tensor( + torch.randint(0, num_seed, (batch_size, self.num_proposal)), + dtype=torch.int32) + aggregation_inputs = dict( + points_xyz=vote_points, + features=vote_features, + indices=sample_indices) + elif sample_mod == 'spec': + # Specify the new center in vote_aggregation + aggregation_inputs = dict( + points_xyz=seed_points, + features=seed_features, + target_xyz=vote_points) + else: + raise NotImplementedError( + f'Sample mode {sample_mod} is not supported!') + + vote_aggregation_ret = self.vote_aggregation(**aggregation_inputs) + aggregated_points, features, aggregated_indices = vote_aggregation_ret + + results['aggregated_points'] = aggregated_points + results['aggregated_features'] = features + results['aggregated_indices'] = aggregated_indices + + # 3. predict bbox and score + cls_predictions, reg_predictions = self.conv_pred(features) + + # 4. 
decode predictions + decode_res = self.bbox_coder.split_pred(cls_predictions, + reg_predictions, + aggregated_points) + + results.update(decode_res) + + return results + + @force_fp32(apply_to=('bbox_preds', )) + def loss(self, + bbox_preds, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + img_metas=None, + gt_bboxes_ignore=None, + ret_target=False): + """Compute loss. + + Args: + bbox_preds (dict): Predictions from forward of vote head. + points (list[torch.Tensor]): Input points. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + pts_semantic_mask (list[torch.Tensor]): Point-wise + semantic mask. + pts_instance_mask (list[torch.Tensor]): Point-wise + instance mask. + img_metas (list[dict]): Contain pcd and img's meta info. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + ret_target (Bool): Return targets or not. + + Returns: + dict: Losses of Votenet. + """ + targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, + bbox_preds) + (vote_targets, vote_target_masks, size_class_targets, size_res_targets, + dir_class_targets, dir_res_targets, center_targets, + assigned_center_targets, mask_targets, valid_gt_masks, + objectness_targets, objectness_weights, box_loss_weights, + valid_gt_weights) = targets + + # calculate vote loss + vote_loss = self.vote_module.get_loss(bbox_preds['seed_points'], + bbox_preds['vote_points'], + bbox_preds['seed_indices'], + vote_target_masks, vote_targets) + + # calculate objectness loss + objectness_loss = self.objectness_loss( + bbox_preds['obj_scores'].transpose(2, 1), + objectness_targets, + weight=objectness_weights) + + # calculate center loss + source2target_loss, target2source_loss = self.center_loss( + bbox_preds['center'], + center_targets, + src_weight=box_loss_weights, + dst_weight=valid_gt_weights) + center_loss = source2target_loss + target2source_loss + + # calculate direction class loss + dir_class_loss = self.dir_class_loss( + bbox_preds['dir_class'].transpose(2, 1), + dir_class_targets, + weight=box_loss_weights) + + # calculate direction residual loss + batch_size, proposal_num = size_class_targets.shape[:2] + heading_label_one_hot = vote_targets.new_zeros( + (batch_size, proposal_num, self.num_dir_bins)) + heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) + dir_res_norm = torch.sum( + bbox_preds['dir_res_norm'] * heading_label_one_hot, -1) + dir_res_loss = self.dir_res_loss( + dir_res_norm, dir_res_targets, weight=box_loss_weights) + + # calculate size class loss + size_class_loss = self.size_class_loss( + bbox_preds['size_class'].transpose(2, 1), + size_class_targets, + weight=box_loss_weights) + + # calculate size residual loss + one_hot_size_targets = vote_targets.new_zeros( + (batch_size, proposal_num, self.num_sizes)) + one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) + one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( + -1).repeat(1, 1, 1, 3).contiguous() + size_residual_norm = torch.sum( + bbox_preds['size_res_norm'] * one_hot_size_targets_expand, 2) + box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat( + 1, 1, 3) + size_res_loss = self.size_res_loss( + size_residual_norm, + size_res_targets, + weight=box_loss_weights_expand) + + # calculate semantic loss + semantic_loss = self.semantic_loss( + bbox_preds['sem_scores'].transpose(2, 1), + mask_targets, + 
weight=box_loss_weights) + + losses = dict( + vote_loss=vote_loss, + objectness_loss=objectness_loss, + semantic_loss=semantic_loss, + center_loss=center_loss, + dir_class_loss=dir_class_loss, + dir_res_loss=dir_res_loss, + size_class_loss=size_class_loss, + size_res_loss=size_res_loss) + + if self.iou_loss: + corners_pred = self.bbox_coder.decode_corners( + bbox_preds['center'], size_residual_norm, + one_hot_size_targets_expand) + corners_target = self.bbox_coder.decode_corners( + assigned_center_targets, size_res_targets, + one_hot_size_targets_expand) + iou_loss = self.iou_loss( + corners_pred, corners_target, weight=box_loss_weights) + losses['iou_loss'] = iou_loss + + if ret_target: + losses['targets'] = targets + + return losses + + def get_targets(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + bbox_preds=None): + """Generate targets of vote head. + + Args: + points (list[torch.Tensor]): Points of each batch. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): Labels of each batch. + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): Point-wise instance + label of each batch. + bbox_preds (torch.Tensor): Bounding box predictions of vote head. + + Returns: + tuple[torch.Tensor]: Targets of vote head. + """ + # find empty example + valid_gt_masks = list() + gt_num = list() + for index in range(len(gt_labels_3d)): + if len(gt_labels_3d[index]) == 0: + fake_box = gt_bboxes_3d[index].tensor.new_zeros( + 1, gt_bboxes_3d[index].tensor.shape[-1]) + gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) + gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) + valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) + gt_num.append(1) + else: + valid_gt_masks.append(gt_labels_3d[index].new_ones( + gt_labels_3d[index].shape)) + gt_num.append(gt_labels_3d[index].shape[0]) + max_gt_num = max(gt_num) + + if pts_semantic_mask is None: + pts_semantic_mask = [None for i in range(len(gt_labels_3d))] + pts_instance_mask = [None for i in range(len(gt_labels_3d))] + + aggregated_points = [ + bbox_preds['aggregated_points'][i] + for i in range(len(gt_labels_3d)) + ] + + (vote_targets, vote_target_masks, size_class_targets, size_res_targets, + dir_class_targets, dir_res_targets, center_targets, + assigned_center_targets, mask_targets, objectness_targets, + objectness_masks) = multi_apply(self.get_targets_single, points, + gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, + aggregated_points) + + # pad targets as original code of votenet. 
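A minimal sketch of the one-hot selection pattern used for the direction and size residual losses above: a `scatter_` builds a mask over bins for each proposal's ground-truth class, and an element-wise product plus sum keeps only the residual predicted for that bin. Shapes and names below are toy placeholders, not the head's real tensors; the target-padding loop referenced by the comment above follows right after this sketch.

import torch

# Toy shapes: 2 samples, 4 proposals, 12 direction bins (placeholder values).
batch_size, num_proposal, num_dir_bins = 2, 4, 12
dir_res_pred = torch.rand(batch_size, num_proposal, num_dir_bins)
dir_class_targets = torch.randint(0, num_dir_bins, (batch_size, num_proposal))

# One-hot mask over bins for each proposal's ground-truth direction class.
one_hot = dir_res_pred.new_zeros(batch_size, num_proposal, num_dir_bins)
one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)

# Keep only the residual predicted for the ground-truth bin; this is the
# tensor the residual regression loss is applied to.
dir_res_for_gt_bin = torch.sum(dir_res_pred * one_hot, dim=-1)
assert dir_res_for_gt_bin.shape == (batch_size, num_proposal)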
+ for index in range(len(gt_labels_3d)): + pad_num = max_gt_num - gt_labels_3d[index].shape[0] + center_targets[index] = F.pad(center_targets[index], + (0, 0, 0, pad_num)) + valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num)) + + vote_targets = torch.stack(vote_targets) + vote_target_masks = torch.stack(vote_target_masks) + center_targets = torch.stack(center_targets) + valid_gt_masks = torch.stack(valid_gt_masks) + + assigned_center_targets = torch.stack(assigned_center_targets) + objectness_targets = torch.stack(objectness_targets) + objectness_weights = torch.stack(objectness_masks) + objectness_weights /= (torch.sum(objectness_weights) + 1e-6) + box_loss_weights = objectness_targets.float() / ( + torch.sum(objectness_targets).float() + 1e-6) + valid_gt_weights = valid_gt_masks.float() / ( + torch.sum(valid_gt_masks.float()) + 1e-6) + dir_class_targets = torch.stack(dir_class_targets) + dir_res_targets = torch.stack(dir_res_targets) + size_class_targets = torch.stack(size_class_targets) + size_res_targets = torch.stack(size_res_targets) + mask_targets = torch.stack(mask_targets) + + return (vote_targets, vote_target_masks, size_class_targets, + size_res_targets, dir_class_targets, dir_res_targets, + center_targets, assigned_center_targets, mask_targets, + valid_gt_masks, objectness_targets, objectness_weights, + box_loss_weights, valid_gt_weights) + + def get_targets_single(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + aggregated_points=None): + """Generate targets of vote head for single batch. + + Args: + points (torch.Tensor): Points of each batch. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth + boxes of each batch. + gt_labels_3d (torch.Tensor): Labels of each batch. + pts_semantic_mask (torch.Tensor): Point-wise semantic + label of each batch. + pts_instance_mask (torch.Tensor): Point-wise instance + label of each batch. + aggregated_points (torch.Tensor): Aggregated points from + vote aggregation layer. + + Returns: + tuple[torch.Tensor]: Targets of vote head. 
+ """ + assert self.bbox_coder.with_rot or pts_semantic_mask is not None + + gt_bboxes_3d = gt_bboxes_3d.to(points.device) + + # generate votes target + num_points = points.shape[0] + if self.bbox_coder.with_rot: + vote_targets = points.new_zeros([num_points, 3 * self.gt_per_seed]) + vote_target_masks = points.new_zeros([num_points], + dtype=torch.long) + vote_target_idx = points.new_zeros([num_points], dtype=torch.long) + box_indices_all = gt_bboxes_3d.points_in_boxes_all(points) + for i in range(gt_labels_3d.shape[0]): + box_indices = box_indices_all[:, i] + indices = torch.nonzero( + box_indices, as_tuple=False).squeeze(-1) + selected_points = points[indices] + vote_target_masks[indices] = 1 + vote_targets_tmp = vote_targets[indices] + votes = gt_bboxes_3d.gravity_center[i].unsqueeze( + 0) - selected_points[:, :3] + + for j in range(self.gt_per_seed): + column_indices = torch.nonzero( + vote_target_idx[indices] == j, + as_tuple=False).squeeze(-1) + vote_targets_tmp[column_indices, + int(j * 3):int(j * 3 + + 3)] = votes[column_indices] + if j == 0: + vote_targets_tmp[column_indices] = votes[ + column_indices].repeat(1, self.gt_per_seed) + + vote_targets[indices] = vote_targets_tmp + vote_target_idx[indices] = torch.clamp( + vote_target_idx[indices] + 1, max=2) + elif pts_semantic_mask is not None: + vote_targets = points.new_zeros([num_points, 3]) + vote_target_masks = points.new_zeros([num_points], + dtype=torch.long) + + for i in torch.unique(pts_instance_mask): + indices = torch.nonzero( + pts_instance_mask == i, as_tuple=False).squeeze(-1) + if pts_semantic_mask[indices[0]] < self.num_classes: + selected_points = points[indices, :3] + center = 0.5 * ( + selected_points.min(0)[0] + selected_points.max(0)[0]) + vote_targets[indices, :] = center - selected_points + vote_target_masks[indices] = 1 + vote_targets = vote_targets.repeat((1, self.gt_per_seed)) + else: + raise NotImplementedError + + (center_targets, size_class_targets, size_res_targets, + dir_class_targets, + dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) + + proposal_num = aggregated_points.shape[0] + distance1, _, assignment, _ = chamfer_distance( + aggregated_points.unsqueeze(0), + center_targets.unsqueeze(0), + reduction='none') + assignment = assignment.squeeze(0) + euclidean_distance1 = torch.sqrt(distance1.squeeze(0) + 1e-6) + + objectness_targets = points.new_zeros((proposal_num), dtype=torch.long) + objectness_targets[ + euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1 + + objectness_masks = points.new_zeros((proposal_num)) + objectness_masks[ + euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1.0 + objectness_masks[ + euclidean_distance1 > self.train_cfg['neg_distance_thr']] = 1.0 + + dir_class_targets = dir_class_targets[assignment] + dir_res_targets = dir_res_targets[assignment] + dir_res_targets /= (np.pi / self.num_dir_bins) + size_class_targets = size_class_targets[assignment] + size_res_targets = size_res_targets[assignment] + + one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros( + (proposal_num, self.num_sizes)) + one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1) + one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).repeat( + 1, 1, 3) + mean_sizes = size_res_targets.new_tensor( + self.bbox_coder.mean_sizes).unsqueeze(0) + pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1) + size_res_targets /= pos_mean_sizes + + mask_targets = gt_labels_3d[assignment] + assigned_center_targets = center_targets[assignment] + + return 
(vote_targets, vote_target_masks, size_class_targets, + size_res_targets, dir_class_targets, + dir_res_targets, center_targets, assigned_center_targets, + mask_targets.long(), objectness_targets, objectness_masks) + + def get_bboxes(self, + points, + bbox_preds, + input_metas, + rescale=False, + use_nms=True): + """Generate bboxes from vote head predictions. + + Args: + points (torch.Tensor): Input points. + bbox_preds (dict): Predictions from vote head. + input_metas (list[dict]): Point cloud and image's meta info. + rescale (bool): Whether to rescale bboxes. + use_nms (bool): Whether to apply NMS, skip nms postprocessing + while using vote head in rpn stage. + + Returns: + list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. + """ + # decode boxes + obj_scores = F.softmax(bbox_preds['obj_scores'], dim=-1)[..., -1] + sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1) + bbox3d = self.bbox_coder.decode(bbox_preds) + + if use_nms: + batch_size = bbox3d.shape[0] + results = list() + for b in range(batch_size): + bbox_selected, score_selected, labels = \ + self.multiclass_nms_single(obj_scores[b], sem_scores[b], + bbox3d[b], points[b, ..., :3], + input_metas[b]) + bbox = input_metas[b]['box_type_3d']( + bbox_selected, + box_dim=bbox_selected.shape[-1], + with_yaw=self.bbox_coder.with_rot) + results.append((bbox, score_selected, labels)) + + return results + else: + return bbox3d + + def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, + input_meta): + """Multi-class nms in single batch. + + Args: + obj_scores (torch.Tensor): Objectness score of bounding boxes. + sem_scores (torch.Tensor): semantic class score of bounding boxes. + bbox (torch.Tensor): Predicted bounding boxes. + points (torch.Tensor): Input points. + input_meta (dict): Point cloud and image's meta info. + + Returns: + tuple[torch.Tensor]: Bounding boxes, scores and labels. 
+ """ + bbox = input_meta['box_type_3d']( + bbox, + box_dim=bbox.shape[-1], + with_yaw=self.bbox_coder.with_rot, + origin=(0.5, 0.5, 0.5)) + box_indices = bbox.points_in_boxes_all(points) + + corner3d = bbox.corners + minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) + minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] + minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] + + nonempty_box_mask = box_indices.T.sum(1) > 5 + + bbox_classes = torch.argmax(sem_scores, -1) + nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], + obj_scores[nonempty_box_mask], + bbox_classes[nonempty_box_mask], + self.test_cfg.nms_thr) + + # filter empty boxes and boxes with low score + scores_mask = (obj_scores > self.test_cfg.score_thr) + nonempty_box_inds = torch.nonzero( + nonempty_box_mask, as_tuple=False).flatten() + nonempty_mask = torch.zeros_like(bbox_classes).scatter( + 0, nonempty_box_inds[nms_selected], 1) + selected = (nonempty_mask.bool() & scores_mask.bool()) + + if self.test_cfg.per_class_proposal: + bbox_selected, score_selected, labels = [], [], [] + for k in range(sem_scores.shape[-1]): + bbox_selected.append(bbox[selected].tensor) + score_selected.append(obj_scores[selected] * + sem_scores[selected][:, k]) + labels.append( + torch.zeros_like(bbox_classes[selected]).fill_(k)) + bbox_selected = torch.cat(bbox_selected, 0) + score_selected = torch.cat(score_selected, 0) + labels = torch.cat(labels, 0) + else: + bbox_selected = bbox[selected].tensor + score_selected = obj_scores[selected] + labels = bbox_classes[selected] + + return bbox_selected, score_selected, labels diff --git a/mmdet3d/models/detectors/__init__.py b/mmdet3d/models/detectors/__init__.py index 1924b12..75e517d 100644 --- a/mmdet3d/models/detectors/__init__.py +++ b/mmdet3d/models/detectors/__init__.py @@ -1,27 +1,27 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base import Base3DDetector -from .centerpoint import CenterPoint -from .dynamic_voxelnet import DynamicVoxelNet -from .fcos_mono3d import FCOSMono3D -from .groupfree3dnet import GroupFree3DNet -from .h3dnet import H3DNet -from .imvotenet import ImVoteNet -from .imvoxelnet import ImVoxelNet -from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN -from .mvx_two_stage import MVXTwoStageDetector -from .parta2 import PartA2 -from .point_rcnn import PointRCNN -from .sassd import SASSD -from .single_stage_mono3d import SingleStageMono3DDetector -from .smoke_mono3d import SMOKEMono3D -from .ssd3dnet import SSD3DNet -from .votenet import VoteNet -from .voxelnet import VoxelNet - -__all__ = [ - 'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector', - 'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet', - 'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector', - 'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D', - 'SASSD' -] +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base import Base3DDetector +from .centerpoint import CenterPoint +from .dynamic_voxelnet import DynamicVoxelNet +from .fcos_mono3d import FCOSMono3D +from .groupfree3dnet import GroupFree3DNet +from .h3dnet import H3DNet +from .imvotenet import ImVoteNet +from .imvoxelnet import ImVoxelNet +from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN +from .mvx_two_stage import MVXTwoStageDetector +from .parta2 import PartA2 +from .point_rcnn import PointRCNN +from .sassd import SASSD +from .single_stage_mono3d import SingleStageMono3DDetector +from .smoke_mono3d import SMOKEMono3D +from .ssd3dnet import SSD3DNet +from .votenet import VoteNet +from .voxelnet import VoxelNet + +__all__ = [ + 'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector', + 'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet', + 'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector', + 'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D', + 'SASSD' +] diff --git a/mmdet3d/models/detectors/base.py b/mmdet3d/models/detectors/base.py index 4985c1d..e508480 100644 --- a/mmdet3d/models/detectors/base.py +++ b/mmdet3d/models/detectors/base.py @@ -1,127 +1,127 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from os import path as osp - -import mmcv -import torch -from mmcv.parallel import DataContainer as DC -from mmcv.runner import auto_fp16 - -from mmdet3d.core import Box3DMode, Coord3DMode, show_result -from mmdet.models.detectors import BaseDetector - - -class Base3DDetector(BaseDetector): - """Base class for detectors.""" - - def forward_test(self, points, img_metas, img=None, **kwargs): - """ - Args: - points (list[torch.Tensor]): the outer list indicates test-time - augmentations and inner torch.Tensor should have a shape NxC, - which contains all points in the batch. - img_metas (list[list[dict]]): the outer list indicates test-time - augs (multiscale, flip, etc.) and the inner list indicates - images in a batch - img (list[torch.Tensor], optional): the outer - list indicates test-time augmentations and inner - torch.Tensor should have a shape NxCxHxW, which contains - all images in the batch. Defaults to None. - """ - for var, name in [(points, 'points'), (img_metas, 'img_metas')]: - if not isinstance(var, list): - raise TypeError('{} must be a list, but got {}'.format( - name, type(var))) - - num_augs = len(points) - if num_augs != len(img_metas): - raise ValueError( - 'num of augmentations ({}) != num of image meta ({})'.format( - len(points), len(img_metas))) - - if num_augs == 1: - img = [img] if img is None else img - return self.simple_test(points[0], img_metas[0], img[0], **kwargs) - else: - return self.aug_test(points, img_metas, img, **kwargs) - - @auto_fp16(apply_to=('img', 'points')) - def forward(self, return_loss=True, **kwargs): - """Calls either forward_train or forward_test depending on whether - return_loss=True. - - Note this setting will change the expected inputs. When - `return_loss=True`, img and img_metas are single-nested (i.e. - torch.Tensor and list[dict]), and when `resturn_loss=False`, img and - img_metas should be double nested (i.e. list[torch.Tensor], - list[list[dict]]), with the outer list indicating test time - augmentations. - """ - if return_loss: - return self.forward_train(**kwargs) - else: - return self.forward_test(**kwargs) - - def show_results(self, data, result, out_dir, show=False, score_thr=None): - """Results visualization. 
- - Args: - data (list[dict]): Input points and the information of the sample. - result (list[dict]): Prediction results. - out_dir (str): Output directory of visualization result. - show (bool, optional): Determines whether you are - going to show result by open3d. - Defaults to False. - score_thr (float, optional): Score threshold of bounding boxes. - Default to None. - """ - for batch_id in range(len(result)): - if isinstance(data['points'][0], DC): - points = data['points'][0]._data[0][batch_id].numpy() - elif mmcv.is_list_of(data['points'][0], torch.Tensor): - points = data['points'][0][batch_id] - else: - ValueError(f"Unsupported data type {type(data['points'][0])} " - f'for visualization!') - if isinstance(data['img_metas'][0], DC): - pts_filename = data['img_metas'][0]._data[0][batch_id][ - 'pts_filename'] - box_mode_3d = data['img_metas'][0]._data[0][batch_id][ - 'box_mode_3d'] - elif mmcv.is_list_of(data['img_metas'][0], dict): - pts_filename = data['img_metas'][0][batch_id]['pts_filename'] - box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d'] - else: - ValueError( - f"Unsupported data type {type(data['img_metas'][0])} " - f'for visualization!') - file_name = osp.split(pts_filename)[-1].split('.')[0] - - assert out_dir is not None, 'Expect out_dir, got none.' - - pred_bboxes = result[batch_id]['boxes_3d'] - pred_labels = result[batch_id]['labels_3d'] - - if score_thr is not None: - mask = result[batch_id]['scores_3d'] > score_thr - pred_bboxes = pred_bboxes[mask] - pred_labels = pred_labels[mask] - - # for now we convert points and bbox into depth mode - if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d - == Box3DMode.LIDAR): - points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, - Coord3DMode.DEPTH) - pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d, - Box3DMode.DEPTH) - elif box_mode_3d != Box3DMode.DEPTH: - ValueError( - f'Unsupported box_mode_3d {box_mode_3d} for conversion!') - pred_bboxes = pred_bboxes.tensor.cpu().numpy() - show_result( - points, - None, - pred_bboxes, - out_dir, - file_name, - show=show, - pred_labels=pred_labels) +# Copyright (c) OpenMMLab. All rights reserved. +from os import path as osp + +import mmcv +import torch +from mmcv.parallel import DataContainer as DC +from mmcv.runner import auto_fp16 + +from mmdet3d.core import Box3DMode, Coord3DMode, show_result +from mmdet.models.detectors import BaseDetector + + +class Base3DDetector(BaseDetector): + """Base class for detectors.""" + + def forward_test(self, points, img_metas, img=None, **kwargs): + """ + Args: + points (list[torch.Tensor]): the outer list indicates test-time + augmentations and inner torch.Tensor should have a shape NxC, + which contains all points in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch + img (list[torch.Tensor], optional): the outer + list indicates test-time augmentations and inner + torch.Tensor should have a shape NxCxHxW, which contains + all images in the batch. Defaults to None. 
+ """ + for var, name in [(points, 'points'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + + num_augs = len(points) + if num_augs != len(img_metas): + raise ValueError( + 'num of augmentations ({}) != num of image meta ({})'.format( + len(points), len(img_metas))) + + if num_augs == 1: + img = [img] if img is None else img + return self.simple_test(points[0], img_metas[0], img[0], **kwargs) + else: + return self.aug_test(points, img_metas, img, **kwargs) + + @auto_fp16(apply_to=('img', 'points')) + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def show_results(self, data, result, out_dir, show=False, score_thr=None): + """Results visualization. + + Args: + data (list[dict]): Input points and the information of the sample. + result (list[dict]): Prediction results. + out_dir (str): Output directory of visualization result. + show (bool, optional): Determines whether you are + going to show result by open3d. + Defaults to False. + score_thr (float, optional): Score threshold of bounding boxes. + Default to None. + """ + for batch_id in range(len(result)): + if isinstance(data['points'][0], DC): + points = data['points'][0]._data[0][batch_id].numpy() + elif mmcv.is_list_of(data['points'][0], torch.Tensor): + points = data['points'][0][batch_id] + else: + ValueError(f"Unsupported data type {type(data['points'][0])} " + f'for visualization!') + if isinstance(data['img_metas'][0], DC): + pts_filename = data['img_metas'][0]._data[0][batch_id][ + 'pts_filename'] + box_mode_3d = data['img_metas'][0]._data[0][batch_id][ + 'box_mode_3d'] + elif mmcv.is_list_of(data['img_metas'][0], dict): + pts_filename = data['img_metas'][0][batch_id]['pts_filename'] + box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d'] + else: + ValueError( + f"Unsupported data type {type(data['img_metas'][0])} " + f'for visualization!') + file_name = osp.split(pts_filename)[-1].split('.')[0] + + assert out_dir is not None, 'Expect out_dir, got none.' 
+ + pred_bboxes = result[batch_id]['boxes_3d'] + pred_labels = result[batch_id]['labels_3d'] + + if score_thr is not None: + mask = result[batch_id]['scores_3d'] > score_thr + pred_bboxes = pred_bboxes[mask] + pred_labels = pred_labels[mask] + + # for now we convert points and bbox into depth mode + if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d + == Box3DMode.LIDAR): + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d, + Box3DMode.DEPTH) + elif box_mode_3d != Box3DMode.DEPTH: + ValueError( + f'Unsupported box_mode_3d {box_mode_3d} for conversion!') + pred_bboxes = pred_bboxes.tensor.cpu().numpy() + show_result( + points, + None, + pred_bboxes, + out_dir, + file_name, + show=show, + pred_labels=pred_labels) diff --git a/mmdet3d/models/detectors/centerpoint.py b/mmdet3d/models/detectors/centerpoint.py index 290af5b..fc49d03 100644 --- a/mmdet3d/models/detectors/centerpoint.py +++ b/mmdet3d/models/detectors/centerpoint.py @@ -1,196 +1,196 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d -from ..builder import DETECTORS -from .mvx_two_stage import MVXTwoStageDetector - - -@DETECTORS.register_module() -class CenterPoint(MVXTwoStageDetector): - """Base class of Multi-modality VoxelNet.""" - - def __init__(self, - pts_voxel_layer=None, - pts_voxel_encoder=None, - pts_middle_encoder=None, - pts_fusion_layer=None, - img_backbone=None, - pts_backbone=None, - img_neck=None, - pts_neck=None, - pts_bbox_head=None, - img_roi_head=None, - img_rpn_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(CenterPoint, - self).__init__(pts_voxel_layer, pts_voxel_encoder, - pts_middle_encoder, pts_fusion_layer, - img_backbone, pts_backbone, img_neck, pts_neck, - pts_bbox_head, img_roi_head, img_rpn_head, - train_cfg, test_cfg, pretrained, init_cfg) - - def extract_pts_feat(self, pts, img_feats, img_metas): - """Extract features of points.""" - if not self.with_pts_bbox: - return None - voxels, num_points, coors = self.voxelize(pts) - - voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) - batch_size = coors[-1, 0] + 1 - x = self.pts_middle_encoder(voxel_features, coors, batch_size) - x = self.pts_backbone(x) - if self.with_pts_neck: - x = self.pts_neck(x) - return x - - def forward_pts_train(self, - pts_feats, - gt_bboxes_3d, - gt_labels_3d, - img_metas, - gt_bboxes_ignore=None): - """Forward function for point cloud branch. - - Args: - pts_feats (list[torch.Tensor]): Features of point cloud branch - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. - gt_labels_3d (list[torch.Tensor]): Ground truth labels for - boxes of each sampole - img_metas (list[dict]): Meta information of samples. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - - Returns: - dict: Losses of each branch. 
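`extract_pts_feat` above recovers the batch size from the voxel coordinate tensor rather than from the input list: column 0 of `coors` holds the sample index and the rows are ordered by sample, so the last entry plus one is the batch size. A toy illustration follows (the remaining columns are voxel coordinates whose exact layout does not matter here); the `forward_pts_train` docstring and body continue right after this sketch.

import torch

# Toy voxel coordinates: column 0 is the sample index, the rest are voxel
# coordinates (placeholder values), rows sorted by sample index.
coors = torch.tensor([
    [0, 1, 2, 3],
    [0, 4, 5, 6],
    [1, 0, 0, 1],
    [1, 7, 8, 9],
])

batch_size = int(coors[-1, 0]) + 1  # same recovery as in extract_pts_feat
assert batch_size == 2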
- """ - outs = self.pts_bbox_head(pts_feats) - loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] - losses = self.pts_bbox_head.loss(*loss_inputs) - return losses - - def simple_test_pts(self, x, img_metas, rescale=False): - """Test function of point cloud branch.""" - outs = self.pts_bbox_head(x) - bbox_list = self.pts_bbox_head.get_bboxes( - outs, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def aug_test_pts(self, feats, img_metas, rescale=False): - """Test function of point cloud branch with augmentaiton. - - The function implementation process is as follows: - - - step 1: map features back for double-flip augmentation. - - step 2: merge all features and generate boxes. - - step 3: map boxes back for scale augmentation. - - step 4: merge results. - - Args: - feats (list[torch.Tensor]): Feature of point cloud. - img_metas (list[dict]): Meta information of samples. - rescale (bool, optional): Whether to rescale bboxes. - Default: False. - - Returns: - dict: Returned bboxes consists of the following keys: - - - boxes_3d (:obj:`LiDARInstance3DBoxes`): Predicted bboxes. - - scores_3d (torch.Tensor): Scores of predicted boxes. - - labels_3d (torch.Tensor): Labels of predicted boxes. - """ - # only support aug_test for one sample - outs_list = [] - for x, img_meta in zip(feats, img_metas): - outs = self.pts_bbox_head(x) - # merge augmented outputs before decoding bboxes - for task_id, out in enumerate(outs): - for key in out[0].keys(): - if img_meta[0]['pcd_horizontal_flip']: - outs[task_id][0][key] = torch.flip( - outs[task_id][0][key], dims=[2]) - if key == 'reg': - outs[task_id][0][key][:, 1, ...] = 1 - outs[ - task_id][0][key][:, 1, ...] - elif key == 'rot': - outs[task_id][0][ - key][:, 0, - ...] = -outs[task_id][0][key][:, 0, ...] - elif key == 'vel': - outs[task_id][0][ - key][:, 1, - ...] = -outs[task_id][0][key][:, 1, ...] - if img_meta[0]['pcd_vertical_flip']: - outs[task_id][0][key] = torch.flip( - outs[task_id][0][key], dims=[3]) - if key == 'reg': - outs[task_id][0][key][:, 0, ...] = 1 - outs[ - task_id][0][key][:, 0, ...] - elif key == 'rot': - outs[task_id][0][ - key][:, 1, - ...] = -outs[task_id][0][key][:, 1, ...] - elif key == 'vel': - outs[task_id][0][ - key][:, 0, - ...] = -outs[task_id][0][key][:, 0, ...] 
- - outs_list.append(outs) - - preds_dicts = dict() - scale_img_metas = [] - - # concat outputs sharing the same pcd_scale_factor - for i, (img_meta, outs) in enumerate(zip(img_metas, outs_list)): - pcd_scale_factor = img_meta[0]['pcd_scale_factor'] - if pcd_scale_factor not in preds_dicts.keys(): - preds_dicts[pcd_scale_factor] = outs - scale_img_metas.append(img_meta) - else: - for task_id, out in enumerate(outs): - for key in out[0].keys(): - preds_dicts[pcd_scale_factor][task_id][0][key] += out[ - 0][key] - - aug_bboxes = [] - - for pcd_scale_factor, preds_dict in preds_dicts.items(): - for task_id, pred_dict in enumerate(preds_dict): - # merge outputs with different flips before decoding bboxes - for key in pred_dict[0].keys(): - preds_dict[task_id][0][key] /= len(outs_list) / len( - preds_dicts.keys()) - bbox_list = self.pts_bbox_head.get_bboxes( - preds_dict, img_metas[0], rescale=rescale) - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - if len(preds_dicts.keys()) > 1: - # merge outputs with different scales after decoding bboxes - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, scale_img_metas, - self.pts_bbox_head.test_cfg) - return merged_bboxes - else: - for key in bbox_list[0].keys(): - bbox_list[0][key] = bbox_list[0][key].to('cpu') - return bbox_list[0] - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test function with augmentaiton.""" - img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) - bbox_list = dict() - if pts_feats and self.with_pts_bbox: - pts_bbox = self.aug_test_pts(pts_feats, img_metas, rescale) - bbox_list.update(pts_bbox=pts_bbox) - return [bbox_list] +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d +from ..builder import DETECTORS +from .mvx_two_stage import MVXTwoStageDetector + + +@DETECTORS.register_module() +class CenterPoint(MVXTwoStageDetector): + """Base class of Multi-modality VoxelNet.""" + + def __init__(self, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(CenterPoint, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained, init_cfg) + + def extract_pts_feat(self, pts, img_feats, img_metas): + """Extract features of points.""" + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None): + """Forward function for point cloud branch. + + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. 
+ gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + + Returns: + dict: Losses of each branch. + """ + outs = self.pts_bbox_head(pts_feats) + loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] + losses = self.pts_bbox_head.loss(*loss_inputs) + return losses + + def simple_test_pts(self, x, img_metas, rescale=False): + """Test function of point cloud branch.""" + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes( + outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def aug_test_pts(self, feats, img_metas, rescale=False): + """Test function of point cloud branch with augmentaiton. + + The function implementation process is as follows: + + - step 1: map features back for double-flip augmentation. + - step 2: merge all features and generate boxes. + - step 3: map boxes back for scale augmentation. + - step 4: merge results. + + Args: + feats (list[torch.Tensor]): Feature of point cloud. + img_metas (list[dict]): Meta information of samples. + rescale (bool, optional): Whether to rescale bboxes. + Default: False. + + Returns: + dict: Returned bboxes consists of the following keys: + + - boxes_3d (:obj:`LiDARInstance3DBoxes`): Predicted bboxes. + - scores_3d (torch.Tensor): Scores of predicted boxes. + - labels_3d (torch.Tensor): Labels of predicted boxes. + """ + # only support aug_test for one sample + outs_list = [] + for x, img_meta in zip(feats, img_metas): + outs = self.pts_bbox_head(x) + # merge augmented outputs before decoding bboxes + for task_id, out in enumerate(outs): + for key in out[0].keys(): + if img_meta[0]['pcd_horizontal_flip']: + outs[task_id][0][key] = torch.flip( + outs[task_id][0][key], dims=[2]) + if key == 'reg': + outs[task_id][0][key][:, 1, ...] = 1 - outs[ + task_id][0][key][:, 1, ...] + elif key == 'rot': + outs[task_id][0][ + key][:, 0, + ...] = -outs[task_id][0][key][:, 0, ...] + elif key == 'vel': + outs[task_id][0][ + key][:, 1, + ...] = -outs[task_id][0][key][:, 1, ...] + if img_meta[0]['pcd_vertical_flip']: + outs[task_id][0][key] = torch.flip( + outs[task_id][0][key], dims=[3]) + if key == 'reg': + outs[task_id][0][key][:, 0, ...] = 1 - outs[ + task_id][0][key][:, 0, ...] + elif key == 'rot': + outs[task_id][0][ + key][:, 1, + ...] = -outs[task_id][0][key][:, 1, ...] + elif key == 'vel': + outs[task_id][0][ + key][:, 0, + ...] = -outs[task_id][0][key][:, 0, ...] 
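A condensed sketch of the un-flipping above for a single task head under a horizontal flip, using a toy output dict: the spatial axis is flipped back, the flipped center offset is complemented, and the affected rotation and velocity components are negated. The channel meanings are assumed from the code above and the tensors are placeholders; the aggregation of `outs_list` and the scale-factor merging continue right after this sketch.

import torch

# Toy per-task outputs, shape (batch, channels, H, W); only the keys whose
# channels need sign/offset fixes are shown.
out = {key: torch.rand(1, 2, 4, 4) for key in ('reg', 'rot', 'vel')}

# Undo a horizontal flip the same way aug_test_pts does.
for key in out:
    out[key] = torch.flip(out[key], dims=[2])
    if key == 'reg':
        out[key][:, 1, ...] = 1 - out[key][:, 1, ...]
    elif key == 'rot':
        out[key][:, 0, ...] = -out[key][:, 0, ...]
    elif key == 'vel':
        out[key][:, 1, ...] = -out[key][:, 1, ...]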
+ + outs_list.append(outs) + + preds_dicts = dict() + scale_img_metas = [] + + # concat outputs sharing the same pcd_scale_factor + for i, (img_meta, outs) in enumerate(zip(img_metas, outs_list)): + pcd_scale_factor = img_meta[0]['pcd_scale_factor'] + if pcd_scale_factor not in preds_dicts.keys(): + preds_dicts[pcd_scale_factor] = outs + scale_img_metas.append(img_meta) + else: + for task_id, out in enumerate(outs): + for key in out[0].keys(): + preds_dicts[pcd_scale_factor][task_id][0][key] += out[ + 0][key] + + aug_bboxes = [] + + for pcd_scale_factor, preds_dict in preds_dicts.items(): + for task_id, pred_dict in enumerate(preds_dict): + # merge outputs with different flips before decoding bboxes + for key in pred_dict[0].keys(): + preds_dict[task_id][0][key] /= len(outs_list) / len( + preds_dicts.keys()) + bbox_list = self.pts_bbox_head.get_bboxes( + preds_dict, img_metas[0], rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) + for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + if len(preds_dicts.keys()) > 1: + # merge outputs with different scales after decoding bboxes + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, scale_img_metas, + self.pts_bbox_head.test_cfg) + return merged_bboxes + else: + for key in bbox_list[0].keys(): + bbox_list[0][key] = bbox_list[0][key].to('cpu') + return bbox_list[0] + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test function with augmentaiton.""" + img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) + bbox_list = dict() + if pts_feats and self.with_pts_bbox: + pts_bbox = self.aug_test_pts(pts_feats, img_metas, rescale) + bbox_list.update(pts_bbox=pts_bbox) + return [bbox_list] diff --git a/mmdet3d/models/detectors/dynamic_voxelnet.py b/mmdet3d/models/detectors/dynamic_voxelnet.py index c4226ec..717b7ae 100644 --- a/mmdet3d/models/detectors/dynamic_voxelnet.py +++ b/mmdet3d/models/detectors/dynamic_voxelnet.py @@ -1,71 +1,71 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.runner import force_fp32 -from torch.nn import functional as F - -from ..builder import DETECTORS -from .voxelnet import VoxelNet - - -@DETECTORS.register_module() -class DynamicVoxelNet(VoxelNet): - r"""VoxelNet using `dynamic voxelization `_. - """ - - def __init__(self, - voxel_layer, - voxel_encoder, - middle_encoder, - backbone, - neck=None, - bbox_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(DynamicVoxelNet, self).__init__( - voxel_layer=voxel_layer, - voxel_encoder=voxel_encoder, - middle_encoder=middle_encoder, - backbone=backbone, - neck=neck, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - pretrained=pretrained, - init_cfg=init_cfg) - - def extract_feat(self, points, img_metas): - """Extract features from points.""" - voxels, coors = self.voxelize(points) - voxel_features, feature_coors = self.voxel_encoder(voxels, coors) - batch_size = coors[-1, 0].item() + 1 - x = self.middle_encoder(voxel_features, feature_coors, batch_size) - x = self.backbone(x) - if self.with_neck: - x = self.neck(x) - return x - - @torch.no_grad() - @force_fp32() - def voxelize(self, points): - """Apply dynamic voxelization to points. - - Args: - points (list[torch.Tensor]): Points of each sample. - - Returns: - tuple[torch.Tensor]: Concatenated points and coordinates. 
- """ - coors = [] - # dynamic voxelization only provide a coors mapping - for res in points: - res_coors = self.voxel_layer(res) - coors.append(res_coors) - points = torch.cat(points, dim=0) - coors_batch = [] - for i, coor in enumerate(coors): - coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) - coors_batch.append(coor_pad) - coors_batch = torch.cat(coors_batch, dim=0) - return points, coors_batch +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.runner import force_fp32 +from torch.nn import functional as F + +from ..builder import DETECTORS +from .voxelnet import VoxelNet + + +@DETECTORS.register_module() +class DynamicVoxelNet(VoxelNet): + r"""VoxelNet using `dynamic voxelization `_. + """ + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(DynamicVoxelNet, self).__init__( + voxel_layer=voxel_layer, + voxel_encoder=voxel_encoder, + middle_encoder=middle_encoder, + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + init_cfg=init_cfg) + + def extract_feat(self, points, img_metas): + """Extract features from points.""" + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.voxel_encoder(voxels, coors) + batch_size = coors[-1, 0].item() + 1 + x = self.middle_encoder(voxel_features, feature_coors, batch_size) + x = self.backbone(x) + if self.with_neck: + x = self.neck(x) + return x + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + """Apply dynamic voxelization to points. + + Args: + points (list[torch.Tensor]): Points of each sample. + + Returns: + tuple[torch.Tensor]: Concatenated points and coordinates. + """ + coors = [] + # dynamic voxelization only provide a coors mapping + for res in points: + res_coors = self.voxel_layer(res) + coors.append(res_coors) + points = torch.cat(points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return points, coors_batch diff --git a/mmdet3d/models/detectors/fcos_mono3d.py b/mmdet3d/models/detectors/fcos_mono3d.py index 5baed7b..6d44977 100644 --- a/mmdet3d/models/detectors/fcos_mono3d.py +++ b/mmdet3d/models/detectors/fcos_mono3d.py @@ -1,22 +1,22 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..builder import DETECTORS -from .single_stage_mono3d import SingleStageMono3DDetector - - -@DETECTORS.register_module() -class FCOSMono3D(SingleStageMono3DDetector): - r"""`FCOS3D `_ for monocular 3D object detection. - - Currently please refer to our entry on the - `leaderboard `_. - """ # noqa: E501 - - def __init__(self, - backbone, - neck, - bbox_head, - train_cfg=None, - test_cfg=None, - pretrained=None): - super(FCOSMono3D, self).__init__(backbone, neck, bbox_head, train_cfg, - test_cfg, pretrained) +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import DETECTORS +from .single_stage_mono3d import SingleStageMono3DDetector + + +@DETECTORS.register_module() +class FCOSMono3D(SingleStageMono3DDetector): + r"""`FCOS3D `_ for monocular 3D object detection. + + Currently please refer to our entry on the + `leaderboard `_. 
+ """ # noqa: E501 + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(FCOSMono3D, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/mmdet3d/models/detectors/groupfree3dnet.py b/mmdet3d/models/detectors/groupfree3dnet.py index 71bd002..5b82b54 100644 --- a/mmdet3d/models/detectors/groupfree3dnet.py +++ b/mmdet3d/models/detectors/groupfree3dnet.py @@ -1,105 +1,105 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d -from ..builder import DETECTORS -from .single_stage import SingleStage3DDetector - - -@DETECTORS.register_module() -class GroupFree3DNet(SingleStage3DDetector): - """`Group-Free 3D `_.""" - - def __init__(self, - backbone, - bbox_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None): - super(GroupFree3DNet, self).__init__( - backbone=backbone, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - pretrained=pretrained) - - def forward_train(self, - points, - img_metas, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - gt_bboxes_ignore=None): - """Forward of training. - - Args: - points (list[torch.Tensor]): Points of each batch. - img_metas (list): Image metas. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - pts_semantic_mask (list[torch.Tensor]): point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): point-wise instance - label of each batch. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - - Returns: - dict[str: torch.Tensor]: Losses. - """ - # TODO: refactor votenet series to reduce redundant codes. - points_cat = torch.stack(points) - - x = self.extract_feat(points_cat) - bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod) - loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, - pts_instance_mask, img_metas) - losses = self.bbox_head.loss( - bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - return losses - - def simple_test(self, points, img_metas, imgs=None, rescale=False): - """Forward of testing. - - Args: - points (list[torch.Tensor]): Points of each sample. - img_metas (list): Image metas. - rescale (bool): Whether to rescale results. - Returns: - list: Predicted 3d boxes. 
- """ - points_cat = torch.stack(points) - - x = self.extract_feat(points_cat) - bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) - bbox_list = self.bbox_head.get_bboxes( - points_cat, bbox_preds, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test with augmentation.""" - points_cat = [torch.stack(pts) for pts in points] - feats = self.extract_feats(points_cat, img_metas) - - # only support aug_test for one sample - aug_bboxes = [] - for x, pts_cat, img_meta in zip(feats, points_cat, img_metas): - bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) - bbox_list = self.bbox_head.get_bboxes( - pts_cat, bbox_preds, img_meta, rescale=rescale) - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - # after merging, bboxes will be rescaled to the original image size - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, - self.bbox_head.test_cfg) - - return [merged_bboxes] +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d +from ..builder import DETECTORS +from .single_stage import SingleStage3DDetector + + +@DETECTORS.register_module() +class GroupFree3DNet(SingleStage3DDetector): + """`Group-Free 3D `_.""" + + def __init__(self, + backbone, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(GroupFree3DNet, self).__init__( + backbone=backbone, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) + + def forward_train(self, + points, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + gt_bboxes_ignore=None): + """Forward of training. + + Args: + points (list[torch.Tensor]): Points of each batch. + img_metas (list): Image metas. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. + pts_semantic_mask (list[torch.Tensor]): point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): point-wise instance + label of each batch. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + + Returns: + dict[str: torch.Tensor]: Losses. + """ + # TODO: refactor votenet series to reduce redundant codes. + points_cat = torch.stack(points) + + x = self.extract_feat(points_cat) + bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod) + loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, + pts_instance_mask, img_metas) + losses = self.bbox_head.loss( + bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def simple_test(self, points, img_metas, imgs=None, rescale=False): + """Forward of testing. + + Args: + points (list[torch.Tensor]): Points of each sample. + img_metas (list): Image metas. + rescale (bool): Whether to rescale results. + Returns: + list: Predicted 3d boxes. 
+ """ + points_cat = torch.stack(points) + + x = self.extract_feat(points_cat) + bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) + bbox_list = self.bbox_head.get_bboxes( + points_cat, bbox_preds, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test with augmentation.""" + points_cat = [torch.stack(pts) for pts in points] + feats = self.extract_feats(points_cat, img_metas) + + # only support aug_test for one sample + aug_bboxes = [] + for x, pts_cat, img_meta in zip(feats, points_cat, img_metas): + bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) + bbox_list = self.bbox_head.get_bboxes( + pts_cat, bbox_preds, img_meta, rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) + for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, + self.bbox_head.test_cfg) + + return [merged_bboxes] diff --git a/mmdet3d/models/detectors/h3dnet.py b/mmdet3d/models/detectors/h3dnet.py index 033a9a1..b2c4305 100644 --- a/mmdet3d/models/detectors/h3dnet.py +++ b/mmdet3d/models/detectors/h3dnet.py @@ -1,176 +1,176 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet3d.core import merge_aug_bboxes_3d -from ..builder import DETECTORS -from .two_stage import TwoStage3DDetector - - -@DETECTORS.register_module() -class H3DNet(TwoStage3DDetector): - r"""H3DNet model. - - Please refer to the `paper `_ - """ - - def __init__(self, - backbone, - neck=None, - rpn_head=None, - roi_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(H3DNet, self).__init__( - backbone=backbone, - neck=neck, - rpn_head=rpn_head, - roi_head=roi_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - pretrained=pretrained, - init_cfg=init_cfg) - - def forward_train(self, - points, - img_metas, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - gt_bboxes_ignore=None): - """Forward of training. - - Args: - points (list[torch.Tensor]): Points of each batch. - img_metas (list): Image metas. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - pts_semantic_mask (list[torch.Tensor]): point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): point-wise instance - label of each batch. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - - Returns: - dict: Losses. 
- """ - points_cat = torch.stack(points) - - feats_dict = self.extract_feat(points_cat) - feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]] - feats_dict['fp_features'] = [feats_dict['hd_feature']] - feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]] - - losses = dict() - if self.with_rpn: - rpn_outs = self.rpn_head(feats_dict, self.train_cfg.rpn.sample_mod) - feats_dict.update(rpn_outs) - - rpn_loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, img_metas) - rpn_losses = self.rpn_head.loss( - rpn_outs, - *rpn_loss_inputs, - gt_bboxes_ignore=gt_bboxes_ignore, - ret_target=True) - feats_dict['targets'] = rpn_losses.pop('targets') - losses.update(rpn_losses) - - # Generate rpn proposals - proposal_cfg = self.train_cfg.get('rpn_proposal', - self.test_cfg.rpn) - proposal_inputs = (points, rpn_outs, img_metas) - proposal_list = self.rpn_head.get_bboxes( - *proposal_inputs, use_nms=proposal_cfg.use_nms) - feats_dict['proposal_list'] = proposal_list - else: - raise NotImplementedError - - roi_losses = self.roi_head.forward_train(feats_dict, img_metas, points, - gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, - pts_instance_mask, - gt_bboxes_ignore) - losses.update(roi_losses) - - return losses - - def simple_test(self, points, img_metas, imgs=None, rescale=False): - """Forward of testing. - - Args: - points (list[torch.Tensor]): Points of each sample. - img_metas (list): Image metas. - rescale (bool): Whether to rescale results. - - Returns: - list: Predicted 3d boxes. - """ - points_cat = torch.stack(points) - - feats_dict = self.extract_feat(points_cat) - feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]] - feats_dict['fp_features'] = [feats_dict['hd_feature']] - feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]] - - if self.with_rpn: - proposal_cfg = self.test_cfg.rpn - rpn_outs = self.rpn_head(feats_dict, proposal_cfg.sample_mod) - feats_dict.update(rpn_outs) - # Generate rpn proposals - proposal_list = self.rpn_head.get_bboxes( - points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms) - feats_dict['proposal_list'] = proposal_list - else: - raise NotImplementedError - - return self.roi_head.simple_test( - feats_dict, img_metas, points_cat, rescale=rescale) - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test with augmentation.""" - points_cat = [torch.stack(pts) for pts in points] - feats_dict = self.extract_feats(points_cat, img_metas) - for feat_dict in feats_dict: - feat_dict['fp_xyz'] = [feat_dict['fp_xyz_net0'][-1]] - feat_dict['fp_features'] = [feat_dict['hd_feature']] - feat_dict['fp_indices'] = [feat_dict['fp_indices_net0'][-1]] - - # only support aug_test for one sample - aug_bboxes = [] - for feat_dict, pts_cat, img_meta in zip(feats_dict, points_cat, - img_metas): - if self.with_rpn: - proposal_cfg = self.test_cfg.rpn - rpn_outs = self.rpn_head(feat_dict, proposal_cfg.sample_mod) - feat_dict.update(rpn_outs) - # Generate rpn proposals - proposal_list = self.rpn_head.get_bboxes( - points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms) - feat_dict['proposal_list'] = proposal_list - else: - raise NotImplementedError - - bbox_results = self.roi_head.simple_test( - feat_dict, - self.test_cfg.rcnn.sample_mod, - img_meta, - pts_cat, - rescale=rescale) - aug_bboxes.append(bbox_results) - - # after merging, bboxes will be rescaled to the original image size - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, - self.bbox_head.test_cfg) - - return [merged_bboxes] - - def 
extract_feats(self, points, img_metas): - """Extract features of multiple samples.""" - return [ - self.extract_feat(pts, img_meta) - for pts, img_meta in zip(points, img_metas) - ] +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.core import merge_aug_bboxes_3d +from ..builder import DETECTORS +from .two_stage import TwoStage3DDetector + + +@DETECTORS.register_module() +class H3DNet(TwoStage3DDetector): + r"""H3DNet model. + + Please refer to the `paper `_ + """ + + def __init__(self, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(H3DNet, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + init_cfg=init_cfg) + + def forward_train(self, + points, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + gt_bboxes_ignore=None): + """Forward of training. + + Args: + points (list[torch.Tensor]): Points of each batch. + img_metas (list): Image metas. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. + pts_semantic_mask (list[torch.Tensor]): point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): point-wise instance + label of each batch. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + + Returns: + dict: Losses. + """ + points_cat = torch.stack(points) + + feats_dict = self.extract_feat(points_cat) + feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]] + feats_dict['fp_features'] = [feats_dict['hd_feature']] + feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]] + + losses = dict() + if self.with_rpn: + rpn_outs = self.rpn_head(feats_dict, self.train_cfg.rpn.sample_mod) + feats_dict.update(rpn_outs) + + rpn_loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, img_metas) + rpn_losses = self.rpn_head.loss( + rpn_outs, + *rpn_loss_inputs, + gt_bboxes_ignore=gt_bboxes_ignore, + ret_target=True) + feats_dict['targets'] = rpn_losses.pop('targets') + losses.update(rpn_losses) + + # Generate rpn proposals + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + proposal_inputs = (points, rpn_outs, img_metas) + proposal_list = self.rpn_head.get_bboxes( + *proposal_inputs, use_nms=proposal_cfg.use_nms) + feats_dict['proposal_list'] = proposal_list + else: + raise NotImplementedError + + roi_losses = self.roi_head.forward_train(feats_dict, img_metas, points, + gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, + pts_instance_mask, + gt_bboxes_ignore) + losses.update(roi_losses) + + return losses + + def simple_test(self, points, img_metas, imgs=None, rescale=False): + """Forward of testing. + + Args: + points (list[torch.Tensor]): Points of each sample. + img_metas (list): Image metas. + rescale (bool): Whether to rescale results. + + Returns: + list: Predicted 3d boxes. 
+ """ + points_cat = torch.stack(points) + + feats_dict = self.extract_feat(points_cat) + feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]] + feats_dict['fp_features'] = [feats_dict['hd_feature']] + feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]] + + if self.with_rpn: + proposal_cfg = self.test_cfg.rpn + rpn_outs = self.rpn_head(feats_dict, proposal_cfg.sample_mod) + feats_dict.update(rpn_outs) + # Generate rpn proposals + proposal_list = self.rpn_head.get_bboxes( + points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms) + feats_dict['proposal_list'] = proposal_list + else: + raise NotImplementedError + + return self.roi_head.simple_test( + feats_dict, img_metas, points_cat, rescale=rescale) + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test with augmentation.""" + points_cat = [torch.stack(pts) for pts in points] + feats_dict = self.extract_feats(points_cat, img_metas) + for feat_dict in feats_dict: + feat_dict['fp_xyz'] = [feat_dict['fp_xyz_net0'][-1]] + feat_dict['fp_features'] = [feat_dict['hd_feature']] + feat_dict['fp_indices'] = [feat_dict['fp_indices_net0'][-1]] + + # only support aug_test for one sample + aug_bboxes = [] + for feat_dict, pts_cat, img_meta in zip(feats_dict, points_cat, + img_metas): + if self.with_rpn: + proposal_cfg = self.test_cfg.rpn + rpn_outs = self.rpn_head(feat_dict, proposal_cfg.sample_mod) + feat_dict.update(rpn_outs) + # Generate rpn proposals + proposal_list = self.rpn_head.get_bboxes( + points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms) + feat_dict['proposal_list'] = proposal_list + else: + raise NotImplementedError + + bbox_results = self.roi_head.simple_test( + feat_dict, + self.test_cfg.rcnn.sample_mod, + img_meta, + pts_cat, + rescale=rescale) + aug_bboxes.append(bbox_results) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, + self.bbox_head.test_cfg) + + return [merged_bboxes] + + def extract_feats(self, points, img_metas): + """Extract features of multiple samples.""" + return [ + self.extract_feat(pts, img_meta) + for pts, img_meta in zip(points, img_metas) + ] diff --git a/mmdet3d/models/detectors/imvotenet.py b/mmdet3d/models/detectors/imvotenet.py index 9f48b81..8273a68 100644 --- a/mmdet3d/models/detectors/imvotenet.py +++ b/mmdet3d/models/detectors/imvotenet.py @@ -1,819 +1,819 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -import numpy as np -import torch - -from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d -from mmdet3d.models.utils import MLP -from .. import builder -from ..builder import DETECTORS -from .base import Base3DDetector - - -def sample_valid_seeds(mask, num_sampled_seed=1024): - r"""Randomly sample seeds from all imvotes. - - Modified from ``_ - - Args: - mask (torch.Tensor): Bool tensor in shape ( - seed_num*max_imvote_per_pixel), indicates - whether this imvote corresponds to a 2D bbox. - num_sampled_seed (int): How many to sample from all imvotes. - - Returns: - torch.Tensor: Indices with shape (num_sampled_seed). 
- """ # noqa: E501 - device = mask.device - batch_size = mask.shape[0] - sample_inds = mask.new_zeros((batch_size, num_sampled_seed), - dtype=torch.int64) - for bidx in range(batch_size): - # return index of non zero elements - valid_inds = torch.nonzero(mask[bidx, :]).squeeze(-1) - if len(valid_inds) < num_sampled_seed: - # compute set t1 - t2 - t1 = torch.arange(num_sampled_seed, device=device) - t2 = valid_inds % num_sampled_seed - combined = torch.cat((t1, t2)) - uniques, counts = combined.unique(return_counts=True) - difference = uniques[counts == 1] - - rand_inds = torch.randperm( - len(difference), - device=device)[:num_sampled_seed - len(valid_inds)] - cur_sample_inds = difference[rand_inds] - cur_sample_inds = torch.cat((valid_inds, cur_sample_inds)) - else: - rand_inds = torch.randperm( - len(valid_inds), device=device)[:num_sampled_seed] - cur_sample_inds = valid_inds[rand_inds] - sample_inds[bidx, :] = cur_sample_inds - return sample_inds - - -@DETECTORS.register_module() -class ImVoteNet(Base3DDetector): - r"""`ImVoteNet `_ for 3D detection.""" - - def __init__(self, - pts_backbone=None, - pts_bbox_heads=None, - pts_neck=None, - img_backbone=None, - img_neck=None, - img_roi_head=None, - img_rpn_head=None, - img_mlp=None, - freeze_img_branch=False, - fusion_layer=None, - num_sampled_seed=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - - super(ImVoteNet, self).__init__(init_cfg=init_cfg) - - # point branch - if pts_backbone is not None: - self.pts_backbone = builder.build_backbone(pts_backbone) - if pts_neck is not None: - self.pts_neck = builder.build_neck(pts_neck) - if pts_bbox_heads is not None: - pts_bbox_head_common = pts_bbox_heads.common - pts_bbox_head_common.update( - train_cfg=train_cfg.pts if train_cfg is not None else None) - pts_bbox_head_common.update(test_cfg=test_cfg.pts) - pts_bbox_head_joint = pts_bbox_head_common.copy() - pts_bbox_head_joint.update(pts_bbox_heads.joint) - pts_bbox_head_pts = pts_bbox_head_common.copy() - pts_bbox_head_pts.update(pts_bbox_heads.pts) - pts_bbox_head_img = pts_bbox_head_common.copy() - pts_bbox_head_img.update(pts_bbox_heads.img) - - self.pts_bbox_head_joint = builder.build_head(pts_bbox_head_joint) - self.pts_bbox_head_pts = builder.build_head(pts_bbox_head_pts) - self.pts_bbox_head_img = builder.build_head(pts_bbox_head_img) - self.pts_bbox_heads = [ - self.pts_bbox_head_joint, self.pts_bbox_head_pts, - self.pts_bbox_head_img - ] - self.loss_weights = pts_bbox_heads.loss_weights - - # image branch - if img_backbone: - self.img_backbone = builder.build_backbone(img_backbone) - if img_neck is not None: - self.img_neck = builder.build_neck(img_neck) - if img_rpn_head is not None: - rpn_train_cfg = train_cfg.img_rpn if train_cfg \ - is not None else None - img_rpn_head_ = img_rpn_head.copy() - img_rpn_head_.update( - train_cfg=rpn_train_cfg, test_cfg=test_cfg.img_rpn) - self.img_rpn_head = builder.build_head(img_rpn_head_) - if img_roi_head is not None: - rcnn_train_cfg = train_cfg.img_rcnn if train_cfg \ - is not None else None - img_roi_head.update( - train_cfg=rcnn_train_cfg, test_cfg=test_cfg.img_rcnn) - self.img_roi_head = builder.build_head(img_roi_head) - - # fusion - if fusion_layer is not None: - self.fusion_layer = builder.build_fusion_layer(fusion_layer) - self.max_imvote_per_pixel = fusion_layer.max_imvote_per_pixel - - self.freeze_img_branch = freeze_img_branch - if freeze_img_branch: - self.freeze_img_branch_params() - - if img_mlp is not None: - self.img_mlp = MLP(**img_mlp) - - 
self.num_sampled_seed = num_sampled_seed - - self.train_cfg = train_cfg - self.test_cfg = test_cfg - - if pretrained is None: - img_pretrained = None - pts_pretrained = None - elif isinstance(pretrained, dict): - img_pretrained = pretrained.get('img', None) - pts_pretrained = pretrained.get('pts', None) - else: - raise ValueError( - f'pretrained should be a dict, got {type(pretrained)}') - - if self.with_img_backbone: - if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated ' - 'key, please consider using init_cfg.') - self.img_backbone.init_cfg = dict( - type='Pretrained', checkpoint=img_pretrained) - if self.with_img_roi_head: - if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated ' - 'key, please consider using init_cfg.') - self.img_roi_head.init_cfg = dict( - type='Pretrained', checkpoint=img_pretrained) - - if self.with_pts_backbone: - if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated ' - 'key, please consider using init_cfg.') - self.pts_backbone.init_cfg = dict( - type='Pretrained', checkpoint=pts_pretrained) - - def freeze_img_branch_params(self): - """Freeze all image branch parameters.""" - if self.with_img_bbox_head: - for param in self.img_bbox_head.parameters(): - param.requires_grad = False - if self.with_img_backbone: - for param in self.img_backbone.parameters(): - param.requires_grad = False - if self.with_img_neck: - for param in self.img_neck.parameters(): - param.requires_grad = False - if self.with_img_rpn: - for param in self.img_rpn_head.parameters(): - param.requires_grad = False - if self.with_img_roi_head: - for param in self.img_roi_head.parameters(): - param.requires_grad = False - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - """Overload in order to load img network ckpts into img branch.""" - module_names = ['backbone', 'neck', 'roi_head', 'rpn_head'] - for key in list(state_dict): - for module_name in module_names: - if key.startswith(module_name) and ('img_' + - key) not in state_dict: - state_dict['img_' + key] = state_dict.pop(key) - - super()._load_from_state_dict(state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, - error_msgs) - - def train(self, mode=True): - """Overload in order to keep image branch modules in eval mode.""" - super(ImVoteNet, self).train(mode) - if self.freeze_img_branch: - if self.with_img_bbox_head: - self.img_bbox_head.eval() - if self.with_img_backbone: - self.img_backbone.eval() - if self.with_img_neck: - self.img_neck.eval() - if self.with_img_rpn: - self.img_rpn_head.eval() - if self.with_img_roi_head: - self.img_roi_head.eval() - - @property - def with_img_bbox(self): - """bool: Whether the detector has a 2D image box head.""" - return ((hasattr(self, 'img_roi_head') and self.img_roi_head.with_bbox) - or (hasattr(self, 'img_bbox_head') - and self.img_bbox_head is not None)) - - @property - def with_img_bbox_head(self): - """bool: Whether the detector has a 2D image box head (not roi).""" - return hasattr(self, - 'img_bbox_head') and self.img_bbox_head is not None - - @property - def with_img_backbone(self): - """bool: Whether the detector has a 2D image backbone.""" - return hasattr(self, 'img_backbone') and self.img_backbone is not None - - @property - def with_img_neck(self): - """bool: Whether the detector has a neck in image branch.""" - return hasattr(self, 'img_neck') and self.img_neck is not 
None - - @property - def with_img_rpn(self): - """bool: Whether the detector has a 2D RPN in image detector branch.""" - return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None - - @property - def with_img_roi_head(self): - """bool: Whether the detector has a RoI Head in image branch.""" - return hasattr(self, 'img_roi_head') and self.img_roi_head is not None - - @property - def with_pts_bbox(self): - """bool: Whether the detector has a 3D box head.""" - return hasattr(self, - 'pts_bbox_head') and self.pts_bbox_head is not None - - @property - def with_pts_backbone(self): - """bool: Whether the detector has a 3D backbone.""" - return hasattr(self, 'pts_backbone') and self.pts_backbone is not None - - @property - def with_pts_neck(self): - """bool: Whether the detector has a neck in 3D detector branch.""" - return hasattr(self, 'pts_neck') and self.pts_neck is not None - - def extract_feat(self, imgs): - """Just to inherit from abstract method.""" - pass - - def extract_img_feat(self, img): - """Directly extract features from the img backbone+neck.""" - x = self.img_backbone(img) - if self.with_img_neck: - x = self.img_neck(x) - return x - - def extract_img_feats(self, imgs): - """Extract features from multiple images. - - Args: - imgs (list[torch.Tensor]): A list of images. The images are - augmented from the same image but in different ways. - - Returns: - list[torch.Tensor]: Features of different images - """ - - assert isinstance(imgs, list) - return [self.extract_img_feat(img) for img in imgs] - - def extract_pts_feat(self, pts): - """Extract features of points.""" - x = self.pts_backbone(pts) - if self.with_pts_neck: - x = self.pts_neck(x) - - seed_points = x['fp_xyz'][-1] - seed_features = x['fp_features'][-1] - seed_indices = x['fp_indices'][-1] - - return (seed_points, seed_features, seed_indices) - - def extract_pts_feats(self, pts): - """Extract features of points from multiple samples.""" - assert isinstance(pts, list) - return [self.extract_pts_feat(pt) for pt in pts] - - @torch.no_grad() - def extract_bboxes_2d(self, - img, - img_metas, - train=True, - bboxes_2d=None, - **kwargs): - """Extract bounding boxes from 2d detector. - - Args: - img (torch.Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): Image meta info. - train (bool): train-time or not. - bboxes_2d (list[torch.Tensor]): provided 2d bboxes, - not supported yet. - - Return: - list[torch.Tensor]: a list of processed 2d bounding boxes. 
- """ - if bboxes_2d is None: - x = self.extract_img_feat(img) - proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas) - rets = self.img_roi_head.simple_test( - x, proposal_list, img_metas, rescale=False) - - rets_processed = [] - for ret in rets: - tmp = np.concatenate(ret, axis=0) - sem_class = img.new_zeros((len(tmp))) - start = 0 - for i, bboxes in enumerate(ret): - sem_class[start:start + len(bboxes)] = i - start += len(bboxes) - ret = img.new_tensor(tmp) - - # append class index - ret = torch.cat([ret, sem_class[:, None]], dim=-1) - inds = torch.argsort(ret[:, 4], descending=True) - ret = ret.index_select(0, inds) - - # drop half bboxes during training for better generalization - if train: - rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2] - rand_drop = torch.sort(rand_drop)[0] - ret = ret[rand_drop] - - rets_processed.append(ret.float()) - return rets_processed - else: - rets_processed = [] - for ret in bboxes_2d: - if len(ret) > 0 and train: - rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2] - rand_drop = torch.sort(rand_drop)[0] - ret = ret[rand_drop] - rets_processed.append(ret.float()) - return rets_processed - - def forward_train(self, - points=None, - img=None, - img_metas=None, - gt_bboxes=None, - gt_labels=None, - gt_bboxes_ignore=None, - gt_masks=None, - proposals=None, - bboxes_2d=None, - gt_bboxes_3d=None, - gt_labels_3d=None, - pts_semantic_mask=None, - pts_instance_mask=None, - **kwargs): - """Forwarding of train for image branch pretrain or stage 2 train. - - Args: - points (list[torch.Tensor]): Points of each batch. - img (torch.Tensor): of shape (N, C, H, W) encoding input images. - Typically these should be mean centered and std scaled. - img_metas (list[dict]): list of image and point cloud meta info - dict. For example, keys include 'ori_shape', 'img_norm_cfg', - and 'transformation_3d_flow'. For details on the values of - the keys see `mmdet/datasets/pipelines/formatting.py:Collect`. - gt_bboxes (list[torch.Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[torch.Tensor]): class indices for each - 2d bounding box. - gt_bboxes_ignore (list[torch.Tensor]): specify which - 2d bounding boxes can be ignored when computing the loss. - gt_masks (torch.Tensor): true segmentation masks for each - 2d bbox, used if the architecture supports a segmentation task. - proposals: override rpn proposals (2d) with custom proposals. - Use when `with_rpn` is False. - bboxes_2d (list[torch.Tensor]): provided 2d bboxes, - not supported yet. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes. - gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes. - pts_semantic_mask (list[torch.Tensor]): point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): point-wise instance - label of each batch. - - Returns: - dict[str, torch.Tensor]: a dictionary of loss components. 
- """ - if points is None: - x = self.extract_img_feat(img) - losses = dict() - - # RPN forward and loss - if self.with_img_rpn: - proposal_cfg = self.train_cfg.get('img_rpn_proposal', - self.test_cfg.img_rpn) - rpn_losses, proposal_list = self.img_rpn_head.forward_train( - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_bboxes_ignore=gt_bboxes_ignore, - proposal_cfg=proposal_cfg) - losses.update(rpn_losses) - else: - proposal_list = proposals - - roi_losses = self.img_roi_head.forward_train( - x, img_metas, proposal_list, gt_bboxes, gt_labels, - gt_bboxes_ignore, gt_masks, **kwargs) - losses.update(roi_losses) - return losses - else: - bboxes_2d = self.extract_bboxes_2d( - img, img_metas, bboxes_2d=bboxes_2d, **kwargs) - - points = torch.stack(points) - seeds_3d, seed_3d_features, seed_indices = \ - self.extract_pts_feat(points) - - img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, - img_metas) - - inds = sample_valid_seeds(masks, self.num_sampled_seed) - batch_size, img_feat_size = img_features.shape[:2] - pts_feat_size = seed_3d_features.shape[1] - inds_img = inds.view(batch_size, 1, - -1).expand(-1, img_feat_size, -1) - img_features = img_features.gather(-1, inds_img) - inds = inds % inds.shape[1] - inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) - seeds_3d = seeds_3d.gather(1, inds_seed_xyz) - inds_seed_feats = inds.view(batch_size, 1, - -1).expand(-1, pts_feat_size, -1) - seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) - seed_indices = seed_indices.gather(1, inds) - - img_features = self.img_mlp(img_features) - fused_features = torch.cat([seed_3d_features, img_features], dim=1) - - feat_dict_joint = dict( - seed_points=seeds_3d, - seed_features=fused_features, - seed_indices=seed_indices) - feat_dict_pts = dict( - seed_points=seeds_3d, - seed_features=seed_3d_features, - seed_indices=seed_indices) - feat_dict_img = dict( - seed_points=seeds_3d, - seed_features=img_features, - seed_indices=seed_indices) - - loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, img_metas) - bbox_preds_joints = self.pts_bbox_head_joint( - feat_dict_joint, self.train_cfg.pts.sample_mod) - bbox_preds_pts = self.pts_bbox_head_pts( - feat_dict_pts, self.train_cfg.pts.sample_mod) - bbox_preds_img = self.pts_bbox_head_img( - feat_dict_img, self.train_cfg.pts.sample_mod) - losses_towers = [] - losses_joint = self.pts_bbox_head_joint.loss( - bbox_preds_joints, - *loss_inputs, - gt_bboxes_ignore=gt_bboxes_ignore) - losses_pts = self.pts_bbox_head_pts.loss( - bbox_preds_pts, - *loss_inputs, - gt_bboxes_ignore=gt_bboxes_ignore) - losses_img = self.pts_bbox_head_img.loss( - bbox_preds_img, - *loss_inputs, - gt_bboxes_ignore=gt_bboxes_ignore) - losses_towers.append(losses_joint) - losses_towers.append(losses_pts) - losses_towers.append(losses_img) - combined_losses = dict() - for loss_term in losses_joint: - if 'loss' in loss_term: - combined_losses[loss_term] = 0 - for i in range(len(losses_towers)): - combined_losses[loss_term] += \ - losses_towers[i][loss_term] * \ - self.loss_weights[i] - else: - # only save the metric of the joint head - # if it is not a loss - combined_losses[loss_term] = \ - losses_towers[0][loss_term] - - return combined_losses - - def forward_test(self, - points=None, - img_metas=None, - img=None, - bboxes_2d=None, - **kwargs): - """Forwarding of test for image branch pretrain or stage 2 train. 
- - Args: - points (list[list[torch.Tensor]], optional): the outer - list indicates test-time augmentations and the inner - list contains all points in the batch, where each Tensor - should have a shape NxC. Defaults to None. - img_metas (list[list[dict]], optional): the outer list - indicates test-time augs (multiscale, flip, etc.) - and the inner list indicates images in a batch. - Defaults to None. - img (list[list[torch.Tensor]], optional): the outer - list indicates test-time augmentations and inner Tensor - should have a shape NxCxHxW, which contains all images - in the batch. Defaults to None. Defaults to None. - bboxes_2d (list[list[torch.Tensor]], optional): - Provided 2d bboxes, not supported yet. Defaults to None. - - Returns: - list[list[torch.Tensor]]|list[dict]: Predicted 2d or 3d boxes. - """ - if points is None: - for var, name in [(img, 'img'), (img_metas, 'img_metas')]: - if not isinstance(var, list): - raise TypeError( - f'{name} must be a list, but got {type(var)}') - - num_augs = len(img) - if num_augs != len(img_metas): - raise ValueError(f'num of augmentations ({len(img)}) ' - f'!= num of image meta ({len(img_metas)})') - - if num_augs == 1: - # proposals (List[List[Tensor]]): the outer list indicates - # test-time augs (multiscale, flip, etc.) and the inner list - # indicates images in a batch. - # The Tensor should have a shape Px4, where P is the number of - # proposals. - if 'proposals' in kwargs: - kwargs['proposals'] = kwargs['proposals'][0] - return self.simple_test_img_only( - img=img[0], img_metas=img_metas[0], **kwargs) - else: - assert img[0].size(0) == 1, 'aug test does not support ' \ - 'inference with batch size ' \ - f'{img[0].size(0)}' - # TODO: support test augmentation for predefined proposals - assert 'proposals' not in kwargs - return self.aug_test_img_only( - img=img, img_metas=img_metas, **kwargs) - - else: - for var, name in [(points, 'points'), (img_metas, 'img_metas')]: - if not isinstance(var, list): - raise TypeError('{} must be a list, but got {}'.format( - name, type(var))) - - num_augs = len(points) - if num_augs != len(img_metas): - raise ValueError( - 'num of augmentations ({}) != num of image meta ({})'. - format(len(points), len(img_metas))) - - if num_augs == 1: - return self.simple_test( - points[0], - img_metas[0], - img[0], - bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None, - **kwargs) - else: - return self.aug_test(points, img_metas, img, bboxes_2d, - **kwargs) - - def simple_test_img_only(self, - img, - img_metas, - proposals=None, - rescale=False): - r"""Test without augmentation, image network pretrain. May refer to - ``_. - - Args: - img (torch.Tensor): Should have a shape NxCxHxW, which contains - all images in the batch. - img_metas (list[dict]): - proposals (list[Tensor], optional): override rpn proposals - with custom proposals. Defaults to None. - rescale (bool, optional): Whether or not rescale bboxes to the - original shape of input image. Defaults to False. - - Returns: - list[list[torch.Tensor]]: Predicted 2d boxes. - """ # noqa: E501 - assert self.with_img_bbox, 'Img bbox head must be implemented.' - assert self.with_img_backbone, 'Img backbone must be implemented.' - assert self.with_img_rpn, 'Img rpn must be implemented.' - assert self.with_img_roi_head, 'Img roi head must be implemented.' 
- - x = self.extract_img_feat(img) - - if proposals is None: - proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas) - else: - proposal_list = proposals - - ret = self.img_roi_head.simple_test( - x, proposal_list, img_metas, rescale=rescale) - - return ret - - def simple_test(self, - points=None, - img_metas=None, - img=None, - bboxes_2d=None, - rescale=False, - **kwargs): - """Test without augmentation, stage 2. - - Args: - points (list[torch.Tensor], optional): Elements in the list - should have a shape NxC, the list indicates all point-clouds - in the batch. Defaults to None. - img_metas (list[dict], optional): List indicates - images in a batch. Defaults to None. - img (torch.Tensor, optional): Should have a shape NxCxHxW, - which contains all images in the batch. Defaults to None. - bboxes_2d (list[torch.Tensor], optional): - Provided 2d bboxes, not supported yet. Defaults to None. - rescale (bool, optional): Whether or not rescale bboxes. - Defaults to False. - - Returns: - list[dict]: Predicted 3d boxes. - """ - bboxes_2d = self.extract_bboxes_2d( - img, img_metas, train=False, bboxes_2d=bboxes_2d, **kwargs) - - points = torch.stack(points) - seeds_3d, seed_3d_features, seed_indices = \ - self.extract_pts_feat(points) - - img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, - img_metas) - - inds = sample_valid_seeds(masks, self.num_sampled_seed) - batch_size, img_feat_size = img_features.shape[:2] - pts_feat_size = seed_3d_features.shape[1] - inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1) - img_features = img_features.gather(-1, inds_img) - inds = inds % inds.shape[1] - inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) - seeds_3d = seeds_3d.gather(1, inds_seed_xyz) - inds_seed_feats = inds.view(batch_size, 1, - -1).expand(-1, pts_feat_size, -1) - seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) - seed_indices = seed_indices.gather(1, inds) - - img_features = self.img_mlp(img_features) - - fused_features = torch.cat([seed_3d_features, img_features], dim=1) - - feat_dict = dict( - seed_points=seeds_3d, - seed_features=fused_features, - seed_indices=seed_indices) - bbox_preds = self.pts_bbox_head_joint(feat_dict, - self.test_cfg.pts.sample_mod) - bbox_list = self.pts_bbox_head_joint.get_bboxes( - points, bbox_preds, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def aug_test_img_only(self, img, img_metas, rescale=False): - r"""Test function with augmentation, image network pretrain. May refer - to ``_. - - Args: - img (list[list[torch.Tensor]], optional): the outer - list indicates test-time augmentations and inner Tensor - should have a shape NxCxHxW, which contains all images - in the batch. Defaults to None. Defaults to None. - img_metas (list[list[dict]], optional): the outer list - indicates test-time augs (multiscale, flip, etc.) - and the inner list indicates images in a batch. - Defaults to None. - rescale (bool, optional): Whether or not rescale bboxes to the - original shape of input image. If rescale is False, then - returned bboxes and masks will fit the scale of imgs[0]. - Defaults to None. - - Returns: - list[list[torch.Tensor]]: Predicted 2d boxes. - """ # noqa: E501 - assert self.with_img_bbox, 'Img bbox head must be implemented.' - assert self.with_img_backbone, 'Img backbone must be implemented.' - assert self.with_img_rpn, 'Img rpn must be implemented.' 
- assert self.with_img_roi_head, 'Img roi head must be implemented.' - - x = self.extract_img_feats(img) - proposal_list = self.img_rpn_head.aug_test_rpn(x, img_metas) - - return self.img_roi_head.aug_test( - x, proposal_list, img_metas, rescale=rescale) - - def aug_test(self, - points=None, - img_metas=None, - imgs=None, - bboxes_2d=None, - rescale=False, - **kwargs): - """Test function with augmentation, stage 2. - - Args: - points (list[list[torch.Tensor]], optional): the outer - list indicates test-time augmentations and the inner - list contains all points in the batch, where each Tensor - should have a shape NxC. Defaults to None. - img_metas (list[list[dict]], optional): the outer list - indicates test-time augs (multiscale, flip, etc.) - and the inner list indicates images in a batch. - Defaults to None. - imgs (list[list[torch.Tensor]], optional): the outer - list indicates test-time augmentations and inner Tensor - should have a shape NxCxHxW, which contains all images - in the batch. Defaults to None. Defaults to None. - bboxes_2d (list[list[torch.Tensor]], optional): - Provided 2d bboxes, not supported yet. Defaults to None. - rescale (bool, optional): Whether or not rescale bboxes. - Defaults to False. - - Returns: - list[dict]: Predicted 3d boxes. - """ - points_cat = [torch.stack(pts) for pts in points] - feats = self.extract_pts_feats(points_cat, img_metas) - - # only support aug_test for one sample - aug_bboxes = [] - for x, pts_cat, img_meta, bbox_2d, img in zip(feats, points_cat, - img_metas, bboxes_2d, - imgs): - - bbox_2d = self.extract_bboxes_2d( - img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs) - - seeds_3d, seed_3d_features, seed_indices = x - - img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d, - img_metas) - - inds = sample_valid_seeds(masks, self.num_sampled_seed) - batch_size, img_feat_size = img_features.shape[:2] - pts_feat_size = seed_3d_features.shape[1] - inds_img = inds.view(batch_size, 1, - -1).expand(-1, img_feat_size, -1) - img_features = img_features.gather(-1, inds_img) - inds = inds % inds.shape[1] - inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) - seeds_3d = seeds_3d.gather(1, inds_seed_xyz) - inds_seed_feats = inds.view(batch_size, 1, - -1).expand(-1, pts_feat_size, -1) - seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) - seed_indices = seed_indices.gather(1, inds) - - img_features = self.img_mlp(img_features) - - fused_features = torch.cat([seed_3d_features, img_features], dim=1) - - feat_dict = dict( - seed_points=seeds_3d, - seed_features=fused_features, - seed_indices=seed_indices) - bbox_preds = self.pts_bbox_head_joint(feat_dict, - self.test_cfg.pts.sample_mod) - bbox_list = self.pts_bbox_head_joint.get_bboxes( - pts_cat, bbox_preds, img_metas, rescale=rescale) - - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - # after merging, bboxes will be rescaled to the original image size - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, - self.bbox_head.test_cfg) - - return [merged_bboxes] +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch + +from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d +from mmdet3d.models.utils import MLP +from .. import builder +from ..builder import DETECTORS +from .base import Base3DDetector + + +def sample_valid_seeds(mask, num_sampled_seed=1024): + r"""Randomly sample seeds from all imvotes. 
+ + Modified from ``_ + + Args: + mask (torch.Tensor): Bool tensor in shape ( + seed_num*max_imvote_per_pixel), indicates + whether this imvote corresponds to a 2D bbox. + num_sampled_seed (int): How many to sample from all imvotes. + + Returns: + torch.Tensor: Indices with shape (num_sampled_seed). + """ # noqa: E501 + device = mask.device + batch_size = mask.shape[0] + sample_inds = mask.new_zeros((batch_size, num_sampled_seed), + dtype=torch.int64) + for bidx in range(batch_size): + # return index of non zero elements + valid_inds = torch.nonzero(mask[bidx, :]).squeeze(-1) + if len(valid_inds) < num_sampled_seed: + # compute set t1 - t2 + t1 = torch.arange(num_sampled_seed, device=device) + t2 = valid_inds % num_sampled_seed + combined = torch.cat((t1, t2)) + uniques, counts = combined.unique(return_counts=True) + difference = uniques[counts == 1] + + rand_inds = torch.randperm( + len(difference), + device=device)[:num_sampled_seed - len(valid_inds)] + cur_sample_inds = difference[rand_inds] + cur_sample_inds = torch.cat((valid_inds, cur_sample_inds)) + else: + rand_inds = torch.randperm( + len(valid_inds), device=device)[:num_sampled_seed] + cur_sample_inds = valid_inds[rand_inds] + sample_inds[bidx, :] = cur_sample_inds + return sample_inds + + +@DETECTORS.register_module() +class ImVoteNet(Base3DDetector): + r"""`ImVoteNet `_ for 3D detection.""" + + def __init__(self, + pts_backbone=None, + pts_bbox_heads=None, + pts_neck=None, + img_backbone=None, + img_neck=None, + img_roi_head=None, + img_rpn_head=None, + img_mlp=None, + freeze_img_branch=False, + fusion_layer=None, + num_sampled_seed=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + + super(ImVoteNet, self).__init__(init_cfg=init_cfg) + + # point branch + if pts_backbone is not None: + self.pts_backbone = builder.build_backbone(pts_backbone) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if pts_bbox_heads is not None: + pts_bbox_head_common = pts_bbox_heads.common + pts_bbox_head_common.update( + train_cfg=train_cfg.pts if train_cfg is not None else None) + pts_bbox_head_common.update(test_cfg=test_cfg.pts) + pts_bbox_head_joint = pts_bbox_head_common.copy() + pts_bbox_head_joint.update(pts_bbox_heads.joint) + pts_bbox_head_pts = pts_bbox_head_common.copy() + pts_bbox_head_pts.update(pts_bbox_heads.pts) + pts_bbox_head_img = pts_bbox_head_common.copy() + pts_bbox_head_img.update(pts_bbox_heads.img) + + self.pts_bbox_head_joint = builder.build_head(pts_bbox_head_joint) + self.pts_bbox_head_pts = builder.build_head(pts_bbox_head_pts) + self.pts_bbox_head_img = builder.build_head(pts_bbox_head_img) + self.pts_bbox_heads = [ + self.pts_bbox_head_joint, self.pts_bbox_head_pts, + self.pts_bbox_head_img + ] + self.loss_weights = pts_bbox_heads.loss_weights + + # image branch + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if img_rpn_head is not None: + rpn_train_cfg = train_cfg.img_rpn if train_cfg \ + is not None else None + img_rpn_head_ = img_rpn_head.copy() + img_rpn_head_.update( + train_cfg=rpn_train_cfg, test_cfg=test_cfg.img_rpn) + self.img_rpn_head = builder.build_head(img_rpn_head_) + if img_roi_head is not None: + rcnn_train_cfg = train_cfg.img_rcnn if train_cfg \ + is not None else None + img_roi_head.update( + train_cfg=rcnn_train_cfg, test_cfg=test_cfg.img_rcnn) + self.img_roi_head = builder.build_head(img_roi_head) + + # fusion + if fusion_layer is not None: 
+ self.fusion_layer = builder.build_fusion_layer(fusion_layer) + self.max_imvote_per_pixel = fusion_layer.max_imvote_per_pixel + + self.freeze_img_branch = freeze_img_branch + if freeze_img_branch: + self.freeze_img_branch_params() + + if img_mlp is not None: + self.img_mlp = MLP(**img_mlp) + + self.num_sampled_seed = num_sampled_seed + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if pretrained is None: + img_pretrained = None + pts_pretrained = None + elif isinstance(pretrained, dict): + img_pretrained = pretrained.get('img', None) + pts_pretrained = pretrained.get('pts', None) + else: + raise ValueError( + f'pretrained should be a dict, got {type(pretrained)}') + + if self.with_img_backbone: + if img_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') + self.img_backbone.init_cfg = dict( + type='Pretrained', checkpoint=img_pretrained) + if self.with_img_roi_head: + if img_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') + self.img_roi_head.init_cfg = dict( + type='Pretrained', checkpoint=img_pretrained) + + if self.with_pts_backbone: + if img_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') + self.pts_backbone.init_cfg = dict( + type='Pretrained', checkpoint=pts_pretrained) + + def freeze_img_branch_params(self): + """Freeze all image branch parameters.""" + if self.with_img_bbox_head: + for param in self.img_bbox_head.parameters(): + param.requires_grad = False + if self.with_img_backbone: + for param in self.img_backbone.parameters(): + param.requires_grad = False + if self.with_img_neck: + for param in self.img_neck.parameters(): + param.requires_grad = False + if self.with_img_rpn: + for param in self.img_rpn_head.parameters(): + param.requires_grad = False + if self.with_img_roi_head: + for param in self.img_roi_head.parameters(): + param.requires_grad = False + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """Overload in order to load img network ckpts into img branch.""" + module_names = ['backbone', 'neck', 'roi_head', 'rpn_head'] + for key in list(state_dict): + for module_name in module_names: + if key.startswith(module_name) and ('img_' + + key) not in state_dict: + state_dict['img_' + key] = state_dict.pop(key) + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def train(self, mode=True): + """Overload in order to keep image branch modules in eval mode.""" + super(ImVoteNet, self).train(mode) + if self.freeze_img_branch: + if self.with_img_bbox_head: + self.img_bbox_head.eval() + if self.with_img_backbone: + self.img_backbone.eval() + if self.with_img_neck: + self.img_neck.eval() + if self.with_img_rpn: + self.img_rpn_head.eval() + if self.with_img_roi_head: + self.img_roi_head.eval() + + @property + def with_img_bbox(self): + """bool: Whether the detector has a 2D image box head.""" + return ((hasattr(self, 'img_roi_head') and self.img_roi_head.with_bbox) + or (hasattr(self, 'img_bbox_head') + and self.img_bbox_head is not None)) + + @property + def with_img_bbox_head(self): + """bool: Whether the detector has a 2D image box head (not roi).""" + return hasattr(self, + 'img_bbox_head') and self.img_bbox_head is not None + + @property + def with_img_backbone(self): 
+ """bool: Whether the detector has a 2D image backbone.""" + return hasattr(self, 'img_backbone') and self.img_backbone is not None + + @property + def with_img_neck(self): + """bool: Whether the detector has a neck in image branch.""" + return hasattr(self, 'img_neck') and self.img_neck is not None + + @property + def with_img_rpn(self): + """bool: Whether the detector has a 2D RPN in image detector branch.""" + return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None + + @property + def with_img_roi_head(self): + """bool: Whether the detector has a RoI Head in image branch.""" + return hasattr(self, 'img_roi_head') and self.img_roi_head is not None + + @property + def with_pts_bbox(self): + """bool: Whether the detector has a 3D box head.""" + return hasattr(self, + 'pts_bbox_head') and self.pts_bbox_head is not None + + @property + def with_pts_backbone(self): + """bool: Whether the detector has a 3D backbone.""" + return hasattr(self, 'pts_backbone') and self.pts_backbone is not None + + @property + def with_pts_neck(self): + """bool: Whether the detector has a neck in 3D detector branch.""" + return hasattr(self, 'pts_neck') and self.pts_neck is not None + + def extract_feat(self, imgs): + """Just to inherit from abstract method.""" + pass + + def extract_img_feat(self, img): + """Directly extract features from the img backbone+neck.""" + x = self.img_backbone(img) + if self.with_img_neck: + x = self.img_neck(x) + return x + + def extract_img_feats(self, imgs): + """Extract features from multiple images. + + Args: + imgs (list[torch.Tensor]): A list of images. The images are + augmented from the same image but in different ways. + + Returns: + list[torch.Tensor]: Features of different images + """ + + assert isinstance(imgs, list) + return [self.extract_img_feat(img) for img in imgs] + + def extract_pts_feat(self, pts): + """Extract features of points.""" + x = self.pts_backbone(pts) + if self.with_pts_neck: + x = self.pts_neck(x) + + seed_points = x['fp_xyz'][-1] + seed_features = x['fp_features'][-1] + seed_indices = x['fp_indices'][-1] + + return (seed_points, seed_features, seed_indices) + + def extract_pts_feats(self, pts): + """Extract features of points from multiple samples.""" + assert isinstance(pts, list) + return [self.extract_pts_feat(pt) for pt in pts] + + @torch.no_grad() + def extract_bboxes_2d(self, + img, + img_metas, + train=True, + bboxes_2d=None, + **kwargs): + """Extract bounding boxes from 2d detector. + + Args: + img (torch.Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + img_metas (list[dict]): Image meta info. + train (bool): train-time or not. + bboxes_2d (list[torch.Tensor]): provided 2d bboxes, + not supported yet. + + Return: + list[torch.Tensor]: a list of processed 2d bounding boxes. 
+ """ + if bboxes_2d is None: + x = self.extract_img_feat(img) + proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas) + rets = self.img_roi_head.simple_test( + x, proposal_list, img_metas, rescale=False) + + rets_processed = [] + for ret in rets: + tmp = np.concatenate(ret, axis=0) + sem_class = img.new_zeros((len(tmp))) + start = 0 + for i, bboxes in enumerate(ret): + sem_class[start:start + len(bboxes)] = i + start += len(bboxes) + ret = img.new_tensor(tmp) + + # append class index + ret = torch.cat([ret, sem_class[:, None]], dim=-1) + inds = torch.argsort(ret[:, 4], descending=True) + ret = ret.index_select(0, inds) + + # drop half bboxes during training for better generalization + if train: + rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2] + rand_drop = torch.sort(rand_drop)[0] + ret = ret[rand_drop] + + rets_processed.append(ret.float()) + return rets_processed + else: + rets_processed = [] + for ret in bboxes_2d: + if len(ret) > 0 and train: + rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2] + rand_drop = torch.sort(rand_drop)[0] + ret = ret[rand_drop] + rets_processed.append(ret.float()) + return rets_processed + + def forward_train(self, + points=None, + img=None, + img_metas=None, + gt_bboxes=None, + gt_labels=None, + gt_bboxes_ignore=None, + gt_masks=None, + proposals=None, + bboxes_2d=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + pts_semantic_mask=None, + pts_instance_mask=None, + **kwargs): + """Forwarding of train for image branch pretrain or stage 2 train. + + Args: + points (list[torch.Tensor]): Points of each batch. + img (torch.Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + img_metas (list[dict]): list of image and point cloud meta info + dict. For example, keys include 'ori_shape', 'img_norm_cfg', + and 'transformation_3d_flow'. For details on the values of + the keys see `mmdet/datasets/pipelines/formatting.py:Collect`. + gt_bboxes (list[torch.Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[torch.Tensor]): class indices for each + 2d bounding box. + gt_bboxes_ignore (list[torch.Tensor]): specify which + 2d bounding boxes can be ignored when computing the loss. + gt_masks (torch.Tensor): true segmentation masks for each + 2d bbox, used if the architecture supports a segmentation task. + proposals: override rpn proposals (2d) with custom proposals. + Use when `with_rpn` is False. + bboxes_2d (list[torch.Tensor]): provided 2d bboxes, + not supported yet. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes. + gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes. + pts_semantic_mask (list[torch.Tensor]): point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): point-wise instance + label of each batch. + + Returns: + dict[str, torch.Tensor]: a dictionary of loss components. 
+ """ + if points is None: + x = self.extract_img_feat(img) + losses = dict() + + # RPN forward and loss + if self.with_img_rpn: + proposal_cfg = self.train_cfg.get('img_rpn_proposal', + self.test_cfg.img_rpn) + rpn_losses, proposal_list = self.img_rpn_head.forward_train( + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_ignore=gt_bboxes_ignore, + proposal_cfg=proposal_cfg) + losses.update(rpn_losses) + else: + proposal_list = proposals + + roi_losses = self.img_roi_head.forward_train( + x, img_metas, proposal_list, gt_bboxes, gt_labels, + gt_bboxes_ignore, gt_masks, **kwargs) + losses.update(roi_losses) + return losses + else: + bboxes_2d = self.extract_bboxes_2d( + img, img_metas, bboxes_2d=bboxes_2d, **kwargs) + + points = torch.stack(points) + seeds_3d, seed_3d_features, seed_indices = \ + self.extract_pts_feat(points) + + img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, + img_metas) + + inds = sample_valid_seeds(masks, self.num_sampled_seed) + batch_size, img_feat_size = img_features.shape[:2] + pts_feat_size = seed_3d_features.shape[1] + inds_img = inds.view(batch_size, 1, + -1).expand(-1, img_feat_size, -1) + img_features = img_features.gather(-1, inds_img) + inds = inds % inds.shape[1] + inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) + seeds_3d = seeds_3d.gather(1, inds_seed_xyz) + inds_seed_feats = inds.view(batch_size, 1, + -1).expand(-1, pts_feat_size, -1) + seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) + seed_indices = seed_indices.gather(1, inds) + + img_features = self.img_mlp(img_features) + fused_features = torch.cat([seed_3d_features, img_features], dim=1) + + feat_dict_joint = dict( + seed_points=seeds_3d, + seed_features=fused_features, + seed_indices=seed_indices) + feat_dict_pts = dict( + seed_points=seeds_3d, + seed_features=seed_3d_features, + seed_indices=seed_indices) + feat_dict_img = dict( + seed_points=seeds_3d, + seed_features=img_features, + seed_indices=seed_indices) + + loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, img_metas) + bbox_preds_joints = self.pts_bbox_head_joint( + feat_dict_joint, self.train_cfg.pts.sample_mod) + bbox_preds_pts = self.pts_bbox_head_pts( + feat_dict_pts, self.train_cfg.pts.sample_mod) + bbox_preds_img = self.pts_bbox_head_img( + feat_dict_img, self.train_cfg.pts.sample_mod) + losses_towers = [] + losses_joint = self.pts_bbox_head_joint.loss( + bbox_preds_joints, + *loss_inputs, + gt_bboxes_ignore=gt_bboxes_ignore) + losses_pts = self.pts_bbox_head_pts.loss( + bbox_preds_pts, + *loss_inputs, + gt_bboxes_ignore=gt_bboxes_ignore) + losses_img = self.pts_bbox_head_img.loss( + bbox_preds_img, + *loss_inputs, + gt_bboxes_ignore=gt_bboxes_ignore) + losses_towers.append(losses_joint) + losses_towers.append(losses_pts) + losses_towers.append(losses_img) + combined_losses = dict() + for loss_term in losses_joint: + if 'loss' in loss_term: + combined_losses[loss_term] = 0 + for i in range(len(losses_towers)): + combined_losses[loss_term] += \ + losses_towers[i][loss_term] * \ + self.loss_weights[i] + else: + # only save the metric of the joint head + # if it is not a loss + combined_losses[loss_term] = \ + losses_towers[0][loss_term] + + return combined_losses + + def forward_test(self, + points=None, + img_metas=None, + img=None, + bboxes_2d=None, + **kwargs): + """Forwarding of test for image branch pretrain or stage 2 train. 
+ + Args: + points (list[list[torch.Tensor]], optional): the outer + list indicates test-time augmentations and the inner + list contains all points in the batch, where each Tensor + should have a shape NxC. Defaults to None. + img_metas (list[list[dict]], optional): the outer list + indicates test-time augs (multiscale, flip, etc.) + and the inner list indicates images in a batch. + Defaults to None. + img (list[list[torch.Tensor]], optional): the outer + list indicates test-time augmentations and inner Tensor + should have a shape NxCxHxW, which contains all images + in the batch. Defaults to None. Defaults to None. + bboxes_2d (list[list[torch.Tensor]], optional): + Provided 2d bboxes, not supported yet. Defaults to None. + + Returns: + list[list[torch.Tensor]]|list[dict]: Predicted 2d or 3d boxes. + """ + if points is None: + for var, name in [(img, 'img'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError( + f'{name} must be a list, but got {type(var)}') + + num_augs = len(img) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(img)}) ' + f'!= num of image meta ({len(img_metas)})') + + if num_augs == 1: + # proposals (List[List[Tensor]]): the outer list indicates + # test-time augs (multiscale, flip, etc.) and the inner list + # indicates images in a batch. + # The Tensor should have a shape Px4, where P is the number of + # proposals. + if 'proposals' in kwargs: + kwargs['proposals'] = kwargs['proposals'][0] + return self.simple_test_img_only( + img=img[0], img_metas=img_metas[0], **kwargs) + else: + assert img[0].size(0) == 1, 'aug test does not support ' \ + 'inference with batch size ' \ + f'{img[0].size(0)}' + # TODO: support test augmentation for predefined proposals + assert 'proposals' not in kwargs + return self.aug_test_img_only( + img=img, img_metas=img_metas, **kwargs) + + else: + for var, name in [(points, 'points'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + + num_augs = len(points) + if num_augs != len(img_metas): + raise ValueError( + 'num of augmentations ({}) != num of image meta ({})'. + format(len(points), len(img_metas))) + + if num_augs == 1: + return self.simple_test( + points[0], + img_metas[0], + img[0], + bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None, + **kwargs) + else: + return self.aug_test(points, img_metas, img, bboxes_2d, + **kwargs) + + def simple_test_img_only(self, + img, + img_metas, + proposals=None, + rescale=False): + r"""Test without augmentation, image network pretrain. May refer to + ``_. + + Args: + img (torch.Tensor): Should have a shape NxCxHxW, which contains + all images in the batch. + img_metas (list[dict]): + proposals (list[Tensor], optional): override rpn proposals + with custom proposals. Defaults to None. + rescale (bool, optional): Whether or not rescale bboxes to the + original shape of input image. Defaults to False. + + Returns: + list[list[torch.Tensor]]: Predicted 2d boxes. + """ # noqa: E501 + assert self.with_img_bbox, 'Img bbox head must be implemented.' + assert self.with_img_backbone, 'Img backbone must be implemented.' + assert self.with_img_rpn, 'Img rpn must be implemented.' + assert self.with_img_roi_head, 'Img roi head must be implemented.' 
+ + x = self.extract_img_feat(img) + + if proposals is None: + proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas) + else: + proposal_list = proposals + + ret = self.img_roi_head.simple_test( + x, proposal_list, img_metas, rescale=rescale) + + return ret + + def simple_test(self, + points=None, + img_metas=None, + img=None, + bboxes_2d=None, + rescale=False, + **kwargs): + """Test without augmentation, stage 2. + + Args: + points (list[torch.Tensor], optional): Elements in the list + should have a shape NxC, the list indicates all point-clouds + in the batch. Defaults to None. + img_metas (list[dict], optional): List indicates + images in a batch. Defaults to None. + img (torch.Tensor, optional): Should have a shape NxCxHxW, + which contains all images in the batch. Defaults to None. + bboxes_2d (list[torch.Tensor], optional): + Provided 2d bboxes, not supported yet. Defaults to None. + rescale (bool, optional): Whether or not rescale bboxes. + Defaults to False. + + Returns: + list[dict]: Predicted 3d boxes. + """ + bboxes_2d = self.extract_bboxes_2d( + img, img_metas, train=False, bboxes_2d=bboxes_2d, **kwargs) + + points = torch.stack(points) + seeds_3d, seed_3d_features, seed_indices = \ + self.extract_pts_feat(points) + + img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, + img_metas) + + inds = sample_valid_seeds(masks, self.num_sampled_seed) + batch_size, img_feat_size = img_features.shape[:2] + pts_feat_size = seed_3d_features.shape[1] + inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1) + img_features = img_features.gather(-1, inds_img) + inds = inds % inds.shape[1] + inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) + seeds_3d = seeds_3d.gather(1, inds_seed_xyz) + inds_seed_feats = inds.view(batch_size, 1, + -1).expand(-1, pts_feat_size, -1) + seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) + seed_indices = seed_indices.gather(1, inds) + + img_features = self.img_mlp(img_features) + + fused_features = torch.cat([seed_3d_features, img_features], dim=1) + + feat_dict = dict( + seed_points=seeds_3d, + seed_features=fused_features, + seed_indices=seed_indices) + bbox_preds = self.pts_bbox_head_joint(feat_dict, + self.test_cfg.pts.sample_mod) + bbox_list = self.pts_bbox_head_joint.get_bboxes( + points, bbox_preds, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def aug_test_img_only(self, img, img_metas, rescale=False): + r"""Test function with augmentation, image network pretrain. May refer + to ``_. + + Args: + img (list[list[torch.Tensor]], optional): the outer + list indicates test-time augmentations and inner Tensor + should have a shape NxCxHxW, which contains all images + in the batch. Defaults to None. Defaults to None. + img_metas (list[list[dict]], optional): the outer list + indicates test-time augs (multiscale, flip, etc.) + and the inner list indicates images in a batch. + Defaults to None. + rescale (bool, optional): Whether or not rescale bboxes to the + original shape of input image. If rescale is False, then + returned bboxes and masks will fit the scale of imgs[0]. + Defaults to None. + + Returns: + list[list[torch.Tensor]]: Predicted 2d boxes. + """ # noqa: E501 + assert self.with_img_bbox, 'Img bbox head must be implemented.' + assert self.with_img_backbone, 'Img backbone must be implemented.' + assert self.with_img_rpn, 'Img rpn must be implemented.' 
+ assert self.with_img_roi_head, 'Img roi head must be implemented.' + + x = self.extract_img_feats(img) + proposal_list = self.img_rpn_head.aug_test_rpn(x, img_metas) + + return self.img_roi_head.aug_test( + x, proposal_list, img_metas, rescale=rescale) + + def aug_test(self, + points=None, + img_metas=None, + imgs=None, + bboxes_2d=None, + rescale=False, + **kwargs): + """Test function with augmentation, stage 2. + + Args: + points (list[list[torch.Tensor]], optional): the outer + list indicates test-time augmentations and the inner + list contains all points in the batch, where each Tensor + should have a shape NxC. Defaults to None. + img_metas (list[list[dict]], optional): the outer list + indicates test-time augs (multiscale, flip, etc.) + and the inner list indicates images in a batch. + Defaults to None. + imgs (list[list[torch.Tensor]], optional): the outer + list indicates test-time augmentations and inner Tensor + should have a shape NxCxHxW, which contains all images + in the batch. Defaults to None. Defaults to None. + bboxes_2d (list[list[torch.Tensor]], optional): + Provided 2d bboxes, not supported yet. Defaults to None. + rescale (bool, optional): Whether or not rescale bboxes. + Defaults to False. + + Returns: + list[dict]: Predicted 3d boxes. + """ + points_cat = [torch.stack(pts) for pts in points] + feats = self.extract_pts_feats(points_cat, img_metas) + + # only support aug_test for one sample + aug_bboxes = [] + for x, pts_cat, img_meta, bbox_2d, img in zip(feats, points_cat, + img_metas, bboxes_2d, + imgs): + + bbox_2d = self.extract_bboxes_2d( + img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs) + + seeds_3d, seed_3d_features, seed_indices = x + + img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d, + img_metas) + + inds = sample_valid_seeds(masks, self.num_sampled_seed) + batch_size, img_feat_size = img_features.shape[:2] + pts_feat_size = seed_3d_features.shape[1] + inds_img = inds.view(batch_size, 1, + -1).expand(-1, img_feat_size, -1) + img_features = img_features.gather(-1, inds_img) + inds = inds % inds.shape[1] + inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) + seeds_3d = seeds_3d.gather(1, inds_seed_xyz) + inds_seed_feats = inds.view(batch_size, 1, + -1).expand(-1, pts_feat_size, -1) + seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) + seed_indices = seed_indices.gather(1, inds) + + img_features = self.img_mlp(img_features) + + fused_features = torch.cat([seed_3d_features, img_features], dim=1) + + feat_dict = dict( + seed_points=seeds_3d, + seed_features=fused_features, + seed_indices=seed_indices) + bbox_preds = self.pts_bbox_head_joint(feat_dict, + self.test_cfg.pts.sample_mod) + bbox_list = self.pts_bbox_head_joint.get_bboxes( + pts_cat, bbox_preds, img_metas, rescale=rescale) + + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) + for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, + self.bbox_head.test_cfg) + + return [merged_bboxes] diff --git a/mmdet3d/models/detectors/imvoxelnet.py b/mmdet3d/models/detectors/imvoxelnet.py index ca65b33..8886257 100644 --- a/mmdet3d/models/detectors/imvoxelnet.py +++ b/mmdet3d/models/detectors/imvoxelnet.py @@ -1,138 +1,138 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch - -from mmdet3d.core import bbox3d2result, build_prior_generator -from mmdet3d.models.fusion_layers.point_fusion import point_sample -from mmdet.models.detectors import BaseDetector -from ..builder import DETECTORS, build_backbone, build_head, build_neck - - -@DETECTORS.register_module() -class ImVoxelNet(BaseDetector): - r"""`ImVoxelNet `_.""" - - def __init__(self, - backbone, - neck, - neck_3d, - bbox_head, - n_voxels, - anchor_generator, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.backbone = build_backbone(backbone) - self.neck = build_neck(neck) - self.neck_3d = build_neck(neck_3d) - bbox_head.update(train_cfg=train_cfg) - bbox_head.update(test_cfg=test_cfg) - self.bbox_head = build_head(bbox_head) - self.n_voxels = n_voxels - self.anchor_generator = build_prior_generator(anchor_generator) - self.train_cfg = train_cfg - self.test_cfg = test_cfg - - def extract_feat(self, img, img_metas): - """Extract 3d features from the backbone -> fpn -> 3d projection. - - Args: - img (torch.Tensor): Input images of shape (N, C_in, H, W). - img_metas (list): Image metas. - - Returns: - torch.Tensor: of shape (N, C_out, N_x, N_y, N_z) - """ - x = self.backbone(img) - x = self.neck(x)[0] - points = self.anchor_generator.grid_anchors( - [self.n_voxels[::-1]], device=img.device)[0][:, :3] - volumes = [] - for feature, img_meta in zip(x, img_metas): - img_scale_factor = ( - points.new_tensor(img_meta['scale_factor'][:2]) - if 'scale_factor' in img_meta.keys() else 1) - img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False - img_crop_offset = ( - points.new_tensor(img_meta['img_crop_offset']) - if 'img_crop_offset' in img_meta.keys() else 0) - volume = point_sample( - img_meta, - img_features=feature[None, ...], - points=points, - proj_mat=points.new_tensor(img_meta['lidar2img']), - coord_type='LIDAR', - img_scale_factor=img_scale_factor, - img_crop_offset=img_crop_offset, - img_flip=img_flip, - img_pad_shape=img.shape[-2:], - img_shape=img_meta['img_shape'][:2], - aligned=False) - volumes.append( - volume.reshape(self.n_voxels[::-1] + [-1]).permute(3, 2, 1, 0)) - x = torch.stack(volumes) - x = self.neck_3d(x) - return x - - def forward_train(self, img, img_metas, gt_bboxes_3d, gt_labels_3d, - **kwargs): - """Forward of training. - - Args: - img (torch.Tensor): Input images of shape (N, C_in, H, W). - img_metas (list): Image metas. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - - Returns: - dict[str, torch.Tensor]: A dictionary of loss components. - """ - x = self.extract_feat(img, img_metas) - x = self.bbox_head(x) - losses = self.bbox_head.loss(*x, gt_bboxes_3d, gt_labels_3d, img_metas) - return losses - - def forward_test(self, img, img_metas, **kwargs): - """Forward of testing. - - Args: - img (torch.Tensor): Input images of shape (N, C_in, H, W). - img_metas (list): Image metas. - - Returns: - list[dict]: Predicted 3d boxes. - """ - # not supporting aug_test for now - return self.simple_test(img, img_metas) - - def simple_test(self, img, img_metas): - """Test without augmentations. - - Args: - img (torch.Tensor): Input images of shape (N, C_in, H, W). - img_metas (list): Image metas. - - Returns: - list[dict]: Predicted 3d boxes. 
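Editor's note: ImVoxelNet's `extract_feat` above samples one image feature vector per 3D anchor point and then folds the flat (num_voxels, C) result back into a dense volume via `reshape(n_voxels[::-1] + [-1]).permute(3, 2, 1, 0)`. A shape-only sketch of that fold; the voxel grid size and channel count are made up.

import torch

n_voxels = [40, 40, 16]                        # hypothetical (N_x, N_y, N_z)
num_points = n_voxels[0] * n_voxels[1] * n_voxels[2]
channels = 8
volume = torch.rand(num_points, channels)      # one sampled feature per voxel

dense = volume.reshape(n_voxels[::-1] + [-1]).permute(3, 2, 1, 0)
assert dense.shape == (channels, n_voxels[0], n_voxels[1], n_voxels[2])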
- """ - x = self.extract_feat(img, img_metas) - x = self.bbox_head(x) - bbox_list = self.bbox_head.get_bboxes(*x, img_metas) - bbox_results = [ - bbox3d2result(det_bboxes, det_scores, det_labels) - for det_bboxes, det_scores, det_labels in bbox_list - ] - return bbox_results - - def aug_test(self, imgs, img_metas, **kwargs): - """Test with augmentations. - - Args: - imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W). - img_metas (list): Image metas. - - Returns: - list[dict]: Predicted 3d boxes. - """ - raise NotImplementedError +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.core import bbox3d2result, build_prior_generator +from mmdet3d.models.fusion_layers.point_fusion import point_sample +from mmdet.models.detectors import BaseDetector +from ..builder import DETECTORS, build_backbone, build_head, build_neck + + +@DETECTORS.register_module() +class ImVoxelNet(BaseDetector): + r"""`ImVoxelNet `_.""" + + def __init__(self, + backbone, + neck, + neck_3d, + bbox_head, + n_voxels, + anchor_generator, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.backbone = build_backbone(backbone) + self.neck = build_neck(neck) + self.neck_3d = build_neck(neck_3d) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = build_head(bbox_head) + self.n_voxels = n_voxels + self.anchor_generator = build_prior_generator(anchor_generator) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def extract_feat(self, img, img_metas): + """Extract 3d features from the backbone -> fpn -> 3d projection. + + Args: + img (torch.Tensor): Input images of shape (N, C_in, H, W). + img_metas (list): Image metas. + + Returns: + torch.Tensor: of shape (N, C_out, N_x, N_y, N_z) + """ + x = self.backbone(img) + x = self.neck(x)[0] + points = self.anchor_generator.grid_anchors( + [self.n_voxels[::-1]], device=img.device)[0][:, :3] + volumes = [] + for feature, img_meta in zip(x, img_metas): + img_scale_factor = ( + points.new_tensor(img_meta['scale_factor'][:2]) + if 'scale_factor' in img_meta.keys() else 1) + img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False + img_crop_offset = ( + points.new_tensor(img_meta['img_crop_offset']) + if 'img_crop_offset' in img_meta.keys() else 0) + volume = point_sample( + img_meta, + img_features=feature[None, ...], + points=points, + proj_mat=points.new_tensor(img_meta['lidar2img']), + coord_type='LIDAR', + img_scale_factor=img_scale_factor, + img_crop_offset=img_crop_offset, + img_flip=img_flip, + img_pad_shape=img.shape[-2:], + img_shape=img_meta['img_shape'][:2], + aligned=False) + volumes.append( + volume.reshape(self.n_voxels[::-1] + [-1]).permute(3, 2, 1, 0)) + x = torch.stack(volumes) + x = self.neck_3d(x) + return x + + def forward_train(self, img, img_metas, gt_bboxes_3d, gt_labels_3d, + **kwargs): + """Forward of training. + + Args: + img (torch.Tensor): Input images of shape (N, C_in, H, W). + img_metas (list): Image metas. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. + + Returns: + dict[str, torch.Tensor]: A dictionary of loss components. + """ + x = self.extract_feat(img, img_metas) + x = self.bbox_head(x) + losses = self.bbox_head.loss(*x, gt_bboxes_3d, gt_labels_3d, img_metas) + return losses + + def forward_test(self, img, img_metas, **kwargs): + """Forward of testing. 
+ + Args: + img (torch.Tensor): Input images of shape (N, C_in, H, W). + img_metas (list): Image metas. + + Returns: + list[dict]: Predicted 3d boxes. + """ + # not supporting aug_test for now + return self.simple_test(img, img_metas) + + def simple_test(self, img, img_metas): + """Test without augmentations. + + Args: + img (torch.Tensor): Input images of shape (N, C_in, H, W). + img_metas (list): Image metas. + + Returns: + list[dict]: Predicted 3d boxes. + """ + x = self.extract_feat(img, img_metas) + x = self.bbox_head(x) + bbox_list = self.bbox_head.get_bboxes(*x, img_metas) + bbox_results = [ + bbox3d2result(det_bboxes, det_scores, det_labels) + for det_bboxes, det_scores, det_labels in bbox_list + ] + return bbox_results + + def aug_test(self, imgs, img_metas, **kwargs): + """Test with augmentations. + + Args: + imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W). + img_metas (list): Image metas. + + Returns: + list[dict]: Predicted 3d boxes. + """ + raise NotImplementedError diff --git a/mmdet3d/models/detectors/mvx_faster_rcnn.py b/mmdet3d/models/detectors/mvx_faster_rcnn.py index 07efad6..0672d50 100644 --- a/mmdet3d/models/detectors/mvx_faster_rcnn.py +++ b/mmdet3d/models/detectors/mvx_faster_rcnn.py @@ -1,61 +1,61 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.runner import force_fp32 -from torch.nn import functional as F - -from ..builder import DETECTORS -from .mvx_two_stage import MVXTwoStageDetector - - -@DETECTORS.register_module() -class MVXFasterRCNN(MVXTwoStageDetector): - """Multi-modality VoxelNet using Faster R-CNN.""" - - def __init__(self, **kwargs): - super(MVXFasterRCNN, self).__init__(**kwargs) - - -@DETECTORS.register_module() -class DynamicMVXFasterRCNN(MVXTwoStageDetector): - """Multi-modality VoxelNet using Faster R-CNN and dynamic voxelization.""" - - def __init__(self, **kwargs): - super(DynamicMVXFasterRCNN, self).__init__(**kwargs) - - @torch.no_grad() - @force_fp32() - def voxelize(self, points): - """Apply dynamic voxelization to points. - - Args: - points (list[torch.Tensor]): Points of each sample. - - Returns: - tuple[torch.Tensor]: Concatenated points and coordinates. - """ - coors = [] - # dynamic voxelization only provide a coors mapping - for res in points: - res_coors = self.pts_voxel_layer(res) - coors.append(res_coors) - points = torch.cat(points, dim=0) - coors_batch = [] - for i, coor in enumerate(coors): - coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) - coors_batch.append(coor_pad) - coors_batch = torch.cat(coors_batch, dim=0) - return points, coors_batch - - def extract_pts_feat(self, points, img_feats, img_metas): - """Extract point features.""" - if not self.with_pts_bbox: - return None - voxels, coors = self.voxelize(points) - voxel_features, feature_coors = self.pts_voxel_encoder( - voxels, coors, points, img_feats, img_metas) - batch_size = coors[-1, 0] + 1 - x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) - x = self.pts_backbone(x) - if self.with_pts_neck: - x = self.pts_neck(x) - return x +# Copyright (c) OpenMMLab. All rights reserved. 
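DynamicMVXFasterRCNN.voxelize above only produces a coordinate mapping, and per-sample coordinates are made batch-aware by prepending the sample index before concatenation. A tiny sketch of that F.pad trick with arbitrary example coordinates:

    import torch
    from torch.nn import functional as F

    per_sample_coors = [
        torch.tensor([[4, 7, 1], [4, 7, 2]]),    # sample 0: (z, y, x) voxel coords
        torch.tensor([[0, 3, 9]]),               # sample 1
    ]

    coors_batch = []
    for i, coor in enumerate(per_sample_coors):
        # prepend a column holding the batch index i -> (batch_idx, z, y, x)
        coors_batch.append(F.pad(coor, (1, 0), mode='constant', value=i))
    coors_batch = torch.cat(coors_batch, dim=0)
    # tensor([[0, 4, 7, 1],
    #         [0, 4, 7, 2],
    #         [1, 0, 3, 9]])

Downstream code can then recover the batch size as coors_batch[-1, 0] + 1, which is exactly what the point-branch feature extractors above do.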
+import torch +from mmcv.runner import force_fp32 +from torch.nn import functional as F + +from ..builder import DETECTORS +from .mvx_two_stage import MVXTwoStageDetector + + +@DETECTORS.register_module() +class MVXFasterRCNN(MVXTwoStageDetector): + """Multi-modality VoxelNet using Faster R-CNN.""" + + def __init__(self, **kwargs): + super(MVXFasterRCNN, self).__init__(**kwargs) + + +@DETECTORS.register_module() +class DynamicMVXFasterRCNN(MVXTwoStageDetector): + """Multi-modality VoxelNet using Faster R-CNN and dynamic voxelization.""" + + def __init__(self, **kwargs): + super(DynamicMVXFasterRCNN, self).__init__(**kwargs) + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + """Apply dynamic voxelization to points. + + Args: + points (list[torch.Tensor]): Points of each sample. + + Returns: + tuple[torch.Tensor]: Concatenated points and coordinates. + """ + coors = [] + # dynamic voxelization only provide a coors mapping + for res in points: + res_coors = self.pts_voxel_layer(res) + coors.append(res_coors) + points = torch.cat(points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return points, coors_batch + + def extract_pts_feat(self, points, img_feats, img_metas): + """Extract point features.""" + if not self.with_pts_bbox: + return None + voxels, coors = self.voxelize(points) + voxel_features, feature_coors = self.pts_voxel_encoder( + voxels, coors, points, img_feats, img_metas) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x diff --git a/mmdet3d/models/detectors/mvx_two_stage.py b/mmdet3d/models/detectors/mvx_two_stage.py index 1eba10d..af8c7ec 100644 --- a/mmdet3d/models/detectors/mvx_two_stage.py +++ b/mmdet3d/models/detectors/mvx_two_stage.py @@ -1,503 +1,503 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings -from os import path as osp - -import mmcv -import torch -from mmcv.ops import Voxelization -from mmcv.parallel import DataContainer as DC -from mmcv.runner import force_fp32 -from torch.nn import functional as F - -from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result, - merge_aug_bboxes_3d, show_result) -from mmdet.core import multi_apply -from .. 
import builder -from ..builder import DETECTORS -from .base import Base3DDetector - - -@DETECTORS.register_module() -class MVXTwoStageDetector(Base3DDetector): - """Base class of Multi-modality VoxelNet.""" - - def __init__(self, - pts_voxel_layer=None, - pts_voxel_encoder=None, - pts_middle_encoder=None, - pts_fusion_layer=None, - img_backbone=None, - pts_backbone=None, - img_neck=None, - pts_neck=None, - pts_bbox_head=None, - img_roi_head=None, - img_rpn_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(MVXTwoStageDetector, self).__init__(init_cfg=init_cfg) - - if pts_voxel_layer: - self.pts_voxel_layer = Voxelization(**pts_voxel_layer) - if pts_voxel_encoder: - self.pts_voxel_encoder = builder.build_voxel_encoder( - pts_voxel_encoder) - if pts_middle_encoder: - self.pts_middle_encoder = builder.build_middle_encoder( - pts_middle_encoder) - if pts_backbone: - self.pts_backbone = builder.build_backbone(pts_backbone) - if pts_fusion_layer: - self.pts_fusion_layer = builder.build_fusion_layer( - pts_fusion_layer) - if pts_neck is not None: - self.pts_neck = builder.build_neck(pts_neck) - if pts_bbox_head: - pts_train_cfg = train_cfg.pts if train_cfg else None - pts_bbox_head.update(train_cfg=pts_train_cfg) - pts_test_cfg = test_cfg.pts if test_cfg else None - pts_bbox_head.update(test_cfg=pts_test_cfg) - self.pts_bbox_head = builder.build_head(pts_bbox_head) - - if img_backbone: - self.img_backbone = builder.build_backbone(img_backbone) - if img_neck is not None: - self.img_neck = builder.build_neck(img_neck) - if img_rpn_head is not None: - self.img_rpn_head = builder.build_head(img_rpn_head) - if img_roi_head is not None: - self.img_roi_head = builder.build_head(img_roi_head) - - self.train_cfg = train_cfg - self.test_cfg = test_cfg - - if pretrained is None: - img_pretrained = None - pts_pretrained = None - elif isinstance(pretrained, dict): - img_pretrained = pretrained.get('img', None) - pts_pretrained = pretrained.get('pts', None) - else: - raise ValueError( - f'pretrained should be a dict, got {type(pretrained)}') - - if self.with_img_backbone: - if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated ' - 'key, please consider using init_cfg.') - self.img_backbone.init_cfg = dict( - type='Pretrained', checkpoint=img_pretrained) - if self.with_img_roi_head: - if img_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated ' - 'key, please consider using init_cfg.') - self.img_roi_head.init_cfg = dict( - type='Pretrained', checkpoint=img_pretrained) - if self.with_pts_backbone: - if pts_pretrained is not None: - warnings.warn('DeprecationWarning: pretrained is a deprecated ' - 'key, please consider using init_cfg') - self.pts_backbone.init_cfg = dict( - type='Pretrained', checkpoint=pts_pretrained) - - @property - def with_img_shared_head(self): - """bool: Whether the detector has a shared head in image branch.""" - return hasattr(self, - 'img_shared_head') and self.img_shared_head is not None - - @property - def with_pts_bbox(self): - """bool: Whether the detector has a 3D box head.""" - return hasattr(self, - 'pts_bbox_head') and self.pts_bbox_head is not None - - @property - def with_img_bbox(self): - """bool: Whether the detector has a 2D image box head.""" - return hasattr(self, - 'img_bbox_head') and self.img_bbox_head is not None - - @property - def with_img_backbone(self): - """bool: Whether the detector has a 2D image backbone.""" - return hasattr(self, 
'img_backbone') and self.img_backbone is not None - - @property - def with_pts_backbone(self): - """bool: Whether the detector has a 3D backbone.""" - return hasattr(self, 'pts_backbone') and self.pts_backbone is not None - - @property - def with_fusion(self): - """bool: Whether the detector has a fusion layer.""" - return hasattr(self, - 'pts_fusion_layer') and self.fusion_layer is not None - - @property - def with_img_neck(self): - """bool: Whether the detector has a neck in image branch.""" - return hasattr(self, 'img_neck') and self.img_neck is not None - - @property - def with_pts_neck(self): - """bool: Whether the detector has a neck in 3D detector branch.""" - return hasattr(self, 'pts_neck') and self.pts_neck is not None - - @property - def with_img_rpn(self): - """bool: Whether the detector has a 2D RPN in image detector branch.""" - return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None - - @property - def with_img_roi_head(self): - """bool: Whether the detector has a RoI Head in image branch.""" - return hasattr(self, 'img_roi_head') and self.img_roi_head is not None - - @property - def with_voxel_encoder(self): - """bool: Whether the detector has a voxel encoder.""" - return hasattr(self, - 'voxel_encoder') and self.voxel_encoder is not None - - @property - def with_middle_encoder(self): - """bool: Whether the detector has a middle encoder.""" - return hasattr(self, - 'middle_encoder') and self.middle_encoder is not None - - def extract_img_feat(self, img, img_metas): - """Extract features of images.""" - if self.with_img_backbone and img is not None: - input_shape = img.shape[-2:] - # update real input shape of each single img - for img_meta in img_metas: - img_meta.update(input_shape=input_shape) - - if img.dim() == 5 and img.size(0) == 1: - img.squeeze_() - elif img.dim() == 5 and img.size(0) > 1: - B, N, C, H, W = img.size() - img = img.view(B * N, C, H, W) - img_feats = self.img_backbone(img) - else: - return None - if self.with_img_neck: - img_feats = self.img_neck(img_feats) - return img_feats - - def extract_pts_feat(self, pts, img_feats, img_metas): - """Extract features of points.""" - if not self.with_pts_bbox: - return None - voxels, num_points, coors = self.voxelize(pts) - voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, - img_feats, img_metas) - batch_size = coors[-1, 0] + 1 - x = self.pts_middle_encoder(voxel_features, coors, batch_size) - x = self.pts_backbone(x) - if self.with_pts_neck: - x = self.pts_neck(x) - return x - - def extract_feat(self, points, img, img_metas): - """Extract features from images and points.""" - img_feats = self.extract_img_feat(img, img_metas) - pts_feats = self.extract_pts_feat(points, img_feats, img_metas) - return (img_feats, pts_feats) - - @torch.no_grad() - @force_fp32() - def voxelize(self, points): - """Apply dynamic voxelization to points. - - Args: - points (list[torch.Tensor]): Points of each sample. - - Returns: - tuple[torch.Tensor]: Concatenated points, number of points - per voxel, and coordinates. 
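extract_img_feat above accepts both 4-D and 5-D image batches; when several camera views per sample are stacked into (B, N, C, H, W), they are flattened so an ordinary 2-D backbone can process every view as an independent image. A minimal sketch with illustrative sizes:

    import torch

    imgs = torch.rand(2, 6, 3, 224, 416)          # (B, N_views, C, H, W)

    if imgs.dim() == 5 and imgs.size(0) == 1:
        imgs = imgs.squeeze(0)                    # single sample: drop the batch dim
    elif imgs.dim() == 5 and imgs.size(0) > 1:
        B, N, C, H, W = imgs.size()
        imgs = imgs.view(B * N, C, H, W)          # treat every view as one image

    assert imgs.shape == (12, 3, 224, 416)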
- """ - voxels, coors, num_points = [], [], [] - for res in points: - res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) - voxels.append(res_voxels) - coors.append(res_coors) - num_points.append(res_num_points) - voxels = torch.cat(voxels, dim=0) - num_points = torch.cat(num_points, dim=0) - coors_batch = [] - for i, coor in enumerate(coors): - coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) - coors_batch.append(coor_pad) - coors_batch = torch.cat(coors_batch, dim=0) - return voxels, num_points, coors_batch - - def forward_train(self, - points=None, - img_metas=None, - gt_bboxes_3d=None, - gt_labels_3d=None, - gt_labels=None, - gt_bboxes=None, - img=None, - proposals=None, - gt_bboxes_ignore=None): - """Forward training function. - - Args: - points (list[torch.Tensor], optional): Points of each sample. - Defaults to None. - img_metas (list[dict], optional): Meta information of each sample. - Defaults to None. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): - Ground truth 3D boxes. Defaults to None. - gt_labels_3d (list[torch.Tensor], optional): Ground truth labels - of 3D boxes. Defaults to None. - gt_labels (list[torch.Tensor], optional): Ground truth labels - of 2D boxes in images. Defaults to None. - gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in - images. Defaults to None. - img (torch.Tensor, optional): Images of each sample with shape - (N, C, H, W). Defaults to None. - proposals ([list[torch.Tensor], optional): Predicted proposals - used for training Fast RCNN. Defaults to None. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - 2D boxes in images to be ignored. Defaults to None. - - Returns: - dict: Losses of different branches. - """ - img_feats, pts_feats = self.extract_feat( - points, img=img, img_metas=img_metas) - losses = dict() - if pts_feats: - losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, - gt_labels_3d, img_metas, - gt_bboxes_ignore) - losses.update(losses_pts) - if img_feats: - losses_img = self.forward_img_train( - img_feats, - img_metas=img_metas, - gt_bboxes=gt_bboxes, - gt_labels=gt_labels, - gt_bboxes_ignore=gt_bboxes_ignore, - proposals=proposals) - losses.update(losses_img) - return losses - - def forward_pts_train(self, - pts_feats, - gt_bboxes_3d, - gt_labels_3d, - img_metas, - gt_bboxes_ignore=None): - """Forward function for point cloud branch. - - Args: - pts_feats (list[torch.Tensor]): Features of point cloud branch - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. - gt_labels_3d (list[torch.Tensor]): Ground truth labels for - boxes of each sampole - img_metas (list[dict]): Meta information of samples. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - - Returns: - dict: Losses of each branch. - """ - outs = self.pts_bbox_head(pts_feats) - loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) - losses = self.pts_bbox_head.loss( - *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - return losses - - def forward_img_train(self, - x, - img_metas, - gt_bboxes, - gt_labels, - gt_bboxes_ignore=None, - proposals=None, - **kwargs): - """Forward function for image branch. - - This function works similar to the forward function of Faster R-CNN. - - Args: - x (list[torch.Tensor]): Image features of shape (B, C, H, W) - of multiple levels. - img_metas (list[dict]): Meta information of images. - gt_bboxes (list[torch.Tensor]): Ground truth boxes of each image - sample. 
- gt_labels (list[torch.Tensor]): Ground truth labels of boxes. - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - proposals (list[torch.Tensor], optional): Proposals of each sample. - Defaults to None. - - Returns: - dict: Losses of each branch. - """ - losses = dict() - # RPN forward and loss - if self.with_img_rpn: - rpn_outs = self.img_rpn_head(x) - rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, - self.train_cfg.img_rpn) - rpn_losses = self.img_rpn_head.loss( - *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - losses.update(rpn_losses) - - proposal_cfg = self.train_cfg.get('img_rpn_proposal', - self.test_cfg.img_rpn) - proposal_inputs = rpn_outs + (img_metas, proposal_cfg) - proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) - else: - proposal_list = proposals - - # bbox head forward and loss - if self.with_img_bbox: - # bbox head forward and loss - img_roi_losses = self.img_roi_head.forward_train( - x, img_metas, proposal_list, gt_bboxes, gt_labels, - gt_bboxes_ignore, **kwargs) - losses.update(img_roi_losses) - - return losses - - def simple_test_img(self, x, img_metas, proposals=None, rescale=False): - """Test without augmentation.""" - if proposals is None: - proposal_list = self.simple_test_rpn(x, img_metas, - self.test_cfg.img_rpn) - else: - proposal_list = proposals - - return self.img_roi_head.simple_test( - x, proposal_list, img_metas, rescale=rescale) - - def simple_test_rpn(self, x, img_metas, rpn_test_cfg): - """RPN test function.""" - rpn_outs = self.img_rpn_head(x) - proposal_inputs = rpn_outs + (img_metas, rpn_test_cfg) - proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) - return proposal_list - - def simple_test_pts(self, x, img_metas, rescale=False): - """Test function of point cloud branch.""" - outs = self.pts_bbox_head(x) - bbox_list = self.pts_bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def simple_test(self, points, img_metas, img=None, rescale=False): - """Test function without augmentaiton.""" - img_feats, pts_feats = self.extract_feat( - points, img=img, img_metas=img_metas) - - bbox_list = [dict() for i in range(len(img_metas))] - if pts_feats and self.with_pts_bbox: - bbox_pts = self.simple_test_pts( - pts_feats, img_metas, rescale=rescale) - for result_dict, pts_bbox in zip(bbox_list, bbox_pts): - result_dict['pts_bbox'] = pts_bbox - if img_feats and self.with_img_bbox: - bbox_img = self.simple_test_img( - img_feats, img_metas, rescale=rescale) - for result_dict, img_bbox in zip(bbox_list, bbox_img): - result_dict['img_bbox'] = img_bbox - return bbox_list - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test function with augmentaiton.""" - img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) - - bbox_list = dict() - if pts_feats and self.with_pts_bbox: - bbox_pts = self.aug_test_pts(pts_feats, img_metas, rescale) - bbox_list.update(pts_bbox=bbox_pts) - return [bbox_list] - - def extract_feats(self, points, img_metas, imgs=None): - """Extract point and image features of multiple samples.""" - if imgs is None: - imgs = [None] * len(img_metas) - img_feats, pts_feats = multi_apply(self.extract_feat, points, imgs, - img_metas) - return img_feats, pts_feats - - def aug_test_pts(self, feats, img_metas, rescale=False): - """Test function of point cloud branch with augmentaiton.""" - # only 
support aug_test for one sample - aug_bboxes = [] - for x, img_meta in zip(feats, img_metas): - outs = self.pts_bbox_head(x) - bbox_list = self.pts_bbox_head.get_bboxes( - *outs, img_meta, rescale=rescale) - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - # after merging, bboxes will be rescaled to the original image size - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, - self.pts_bbox_head.test_cfg) - return merged_bboxes - - def show_results(self, data, result, out_dir): - """Results visualization. - - Args: - data (dict): Input points and the information of the sample. - result (dict): Prediction results. - out_dir (str): Output directory of visualization result. - """ - for batch_id in range(len(result)): - if isinstance(data['points'][0], DC): - points = data['points'][0]._data[0][batch_id].numpy() - elif mmcv.is_list_of(data['points'][0], torch.Tensor): - points = data['points'][0][batch_id] - else: - ValueError(f"Unsupported data type {type(data['points'][0])} " - f'for visualization!') - if isinstance(data['img_metas'][0], DC): - pts_filename = data['img_metas'][0]._data[0][batch_id][ - 'pts_filename'] - box_mode_3d = data['img_metas'][0]._data[0][batch_id][ - 'box_mode_3d'] - elif mmcv.is_list_of(data['img_metas'][0], dict): - pts_filename = data['img_metas'][0][batch_id]['pts_filename'] - box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d'] - else: - ValueError( - f"Unsupported data type {type(data['img_metas'][0])} " - f'for visualization!') - file_name = osp.split(pts_filename)[-1].split('.')[0] - - assert out_dir is not None, 'Expect out_dir, got none.' - inds = result[batch_id]['pts_bbox']['scores_3d'] > 0.1 - pred_bboxes = result[batch_id]['pts_bbox']['boxes_3d'][inds] - - # for now we convert points and bbox into depth mode - if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d - == Box3DMode.LIDAR): - points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, - Coord3DMode.DEPTH) - pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d, - Box3DMode.DEPTH) - elif box_mode_3d != Box3DMode.DEPTH: - ValueError( - f'Unsupported box_mode_3d {box_mode_3d} for conversion!') - - pred_bboxes = pred_bboxes.tensor.cpu().numpy() - show_result(points, None, pred_bboxes, out_dir, file_name) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from os import path as osp + +import mmcv +import torch +from mmcv.ops import Voxelization +from mmcv.parallel import DataContainer as DC +from mmcv.runner import force_fp32 +from torch.nn import functional as F + +from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result, + merge_aug_bboxes_3d, show_result) +from mmdet.core import multi_apply +from .. 
import builder +from ..builder import DETECTORS +from .base import Base3DDetector + + +@DETECTORS.register_module() +class MVXTwoStageDetector(Base3DDetector): + """Base class of Multi-modality VoxelNet.""" + + def __init__(self, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(MVXTwoStageDetector, self).__init__(init_cfg=init_cfg) + + if pts_voxel_layer: + self.pts_voxel_layer = Voxelization(**pts_voxel_layer) + if pts_voxel_encoder: + self.pts_voxel_encoder = builder.build_voxel_encoder( + pts_voxel_encoder) + if pts_middle_encoder: + self.pts_middle_encoder = builder.build_middle_encoder( + pts_middle_encoder) + if pts_backbone: + self.pts_backbone = builder.build_backbone(pts_backbone) + if pts_fusion_layer: + self.pts_fusion_layer = builder.build_fusion_layer( + pts_fusion_layer) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if pts_bbox_head: + pts_train_cfg = train_cfg.pts if train_cfg else None + pts_bbox_head.update(train_cfg=pts_train_cfg) + pts_test_cfg = test_cfg.pts if test_cfg else None + pts_bbox_head.update(test_cfg=pts_test_cfg) + self.pts_bbox_head = builder.build_head(pts_bbox_head) + + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if img_rpn_head is not None: + self.img_rpn_head = builder.build_head(img_rpn_head) + if img_roi_head is not None: + self.img_roi_head = builder.build_head(img_roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if pretrained is None: + img_pretrained = None + pts_pretrained = None + elif isinstance(pretrained, dict): + img_pretrained = pretrained.get('img', None) + pts_pretrained = pretrained.get('pts', None) + else: + raise ValueError( + f'pretrained should be a dict, got {type(pretrained)}') + + if self.with_img_backbone: + if img_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') + self.img_backbone.init_cfg = dict( + type='Pretrained', checkpoint=img_pretrained) + if self.with_img_roi_head: + if img_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg.') + self.img_roi_head.init_cfg = dict( + type='Pretrained', checkpoint=img_pretrained) + if self.with_pts_backbone: + if pts_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated ' + 'key, please consider using init_cfg') + self.pts_backbone.init_cfg = dict( + type='Pretrained', checkpoint=pts_pretrained) + + @property + def with_img_shared_head(self): + """bool: Whether the detector has a shared head in image branch.""" + return hasattr(self, + 'img_shared_head') and self.img_shared_head is not None + + @property + def with_pts_bbox(self): + """bool: Whether the detector has a 3D box head.""" + return hasattr(self, + 'pts_bbox_head') and self.pts_bbox_head is not None + + @property + def with_img_bbox(self): + """bool: Whether the detector has a 2D image box head.""" + return hasattr(self, + 'img_bbox_head') and self.img_bbox_head is not None + + @property + def with_img_backbone(self): + """bool: Whether the detector has a 2D image backbone.""" + return hasattr(self, 
'img_backbone') and self.img_backbone is not None + + @property + def with_pts_backbone(self): + """bool: Whether the detector has a 3D backbone.""" + return hasattr(self, 'pts_backbone') and self.pts_backbone is not None + + @property + def with_fusion(self): + """bool: Whether the detector has a fusion layer.""" + return hasattr(self, + 'pts_fusion_layer') and self.fusion_layer is not None + + @property + def with_img_neck(self): + """bool: Whether the detector has a neck in image branch.""" + return hasattr(self, 'img_neck') and self.img_neck is not None + + @property + def with_pts_neck(self): + """bool: Whether the detector has a neck in 3D detector branch.""" + return hasattr(self, 'pts_neck') and self.pts_neck is not None + + @property + def with_img_rpn(self): + """bool: Whether the detector has a 2D RPN in image detector branch.""" + return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None + + @property + def with_img_roi_head(self): + """bool: Whether the detector has a RoI Head in image branch.""" + return hasattr(self, 'img_roi_head') and self.img_roi_head is not None + + @property + def with_voxel_encoder(self): + """bool: Whether the detector has a voxel encoder.""" + return hasattr(self, + 'voxel_encoder') and self.voxel_encoder is not None + + @property + def with_middle_encoder(self): + """bool: Whether the detector has a middle encoder.""" + return hasattr(self, + 'middle_encoder') and self.middle_encoder is not None + + def extract_img_feat(self, img, img_metas): + """Extract features of images.""" + if self.with_img_backbone and img is not None: + input_shape = img.shape[-2:] + # update real input shape of each single img + for img_meta in img_metas: + img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.view(B * N, C, H, W) + img_feats = self.img_backbone(img) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + return img_feats + + def extract_pts_feat(self, pts, img_feats, img_metas): + """Extract features of points.""" + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, + img_feats, img_metas) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + def extract_feat(self, points, img, img_metas): + """Extract features from images and points.""" + img_feats = self.extract_img_feat(img, img_metas) + pts_feats = self.extract_pts_feat(points, img_feats, img_metas) + return (img_feats, pts_feats) + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + """Apply dynamic voxelization to points. + + Args: + points (list[torch.Tensor]): Points of each sample. + + Returns: + tuple[torch.Tensor]: Concatenated points, number of points + per voxel, and coordinates. 
+ """ + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None): + """Forward training function. + + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor, optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + + Returns: + dict: Losses of different branches. + """ + img_feats, pts_feats = self.extract_feat( + points, img=img, img_metas=img_metas) + losses = dict() + if pts_feats: + losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore) + losses.update(losses_pts) + if img_feats: + losses_img = self.forward_img_train( + img_feats, + img_metas=img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_bboxes_ignore=gt_bboxes_ignore, + proposals=proposals) + losses.update(losses_img) + return losses + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None): + """Forward function for point cloud branch. + + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + + Returns: + dict: Losses of each branch. + """ + outs = self.pts_bbox_head(pts_feats) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) + losses = self.pts_bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_img_train(self, + x, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + proposals=None, + **kwargs): + """Forward function for image branch. + + This function works similar to the forward function of Faster R-CNN. + + Args: + x (list[torch.Tensor]): Image features of shape (B, C, H, W) + of multiple levels. + img_metas (list[dict]): Meta information of images. + gt_bboxes (list[torch.Tensor]): Ground truth boxes of each image + sample. 
+ gt_labels (list[torch.Tensor]): Ground truth labels of boxes. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + proposals (list[torch.Tensor], optional): Proposals of each sample. + Defaults to None. + + Returns: + dict: Losses of each branch. + """ + losses = dict() + # RPN forward and loss + if self.with_img_rpn: + rpn_outs = self.img_rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, + self.train_cfg.img_rpn) + rpn_losses = self.img_rpn_head.loss( + *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get('img_rpn_proposal', + self.test_cfg.img_rpn) + proposal_inputs = rpn_outs + (img_metas, proposal_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # bbox head forward and loss + if self.with_img_bbox: + # bbox head forward and loss + img_roi_losses = self.img_roi_head.forward_train( + x, img_metas, proposal_list, gt_bboxes, gt_labels, + gt_bboxes_ignore, **kwargs) + losses.update(img_roi_losses) + + return losses + + def simple_test_img(self, x, img_metas, proposals=None, rescale=False): + """Test without augmentation.""" + if proposals is None: + proposal_list = self.simple_test_rpn(x, img_metas, + self.test_cfg.img_rpn) + else: + proposal_list = proposals + + return self.img_roi_head.simple_test( + x, proposal_list, img_metas, rescale=rescale) + + def simple_test_rpn(self, x, img_metas, rpn_test_cfg): + """RPN test function.""" + rpn_outs = self.img_rpn_head(x) + proposal_inputs = rpn_outs + (img_metas, rpn_test_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def simple_test_pts(self, x, img_metas, rescale=False): + """Test function of point cloud branch.""" + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def simple_test(self, points, img_metas, img=None, rescale=False): + """Test function without augmentaiton.""" + img_feats, pts_feats = self.extract_feat( + points, img=img, img_metas=img_metas) + + bbox_list = [dict() for i in range(len(img_metas))] + if pts_feats and self.with_pts_bbox: + bbox_pts = self.simple_test_pts( + pts_feats, img_metas, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + if img_feats and self.with_img_bbox: + bbox_img = self.simple_test_img( + img_feats, img_metas, rescale=rescale) + for result_dict, img_bbox in zip(bbox_list, bbox_img): + result_dict['img_bbox'] = img_bbox + return bbox_list + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test function with augmentaiton.""" + img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) + + bbox_list = dict() + if pts_feats and self.with_pts_bbox: + bbox_pts = self.aug_test_pts(pts_feats, img_metas, rescale) + bbox_list.update(pts_bbox=bbox_pts) + return [bbox_list] + + def extract_feats(self, points, img_metas, imgs=None): + """Extract point and image features of multiple samples.""" + if imgs is None: + imgs = [None] * len(img_metas) + img_feats, pts_feats = multi_apply(self.extract_feat, points, imgs, + img_metas) + return img_feats, pts_feats + + def aug_test_pts(self, feats, img_metas, rescale=False): + """Test function of point cloud branch with augmentaiton.""" + # only 
support aug_test for one sample + aug_bboxes = [] + for x, img_meta in zip(feats, img_metas): + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes( + *outs, img_meta, rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) + for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, + self.pts_bbox_head.test_cfg) + return merged_bboxes + + def show_results(self, data, result, out_dir): + """Results visualization. + + Args: + data (dict): Input points and the information of the sample. + result (dict): Prediction results. + out_dir (str): Output directory of visualization result. + """ + for batch_id in range(len(result)): + if isinstance(data['points'][0], DC): + points = data['points'][0]._data[0][batch_id].numpy() + elif mmcv.is_list_of(data['points'][0], torch.Tensor): + points = data['points'][0][batch_id] + else: + ValueError(f"Unsupported data type {type(data['points'][0])} " + f'for visualization!') + if isinstance(data['img_metas'][0], DC): + pts_filename = data['img_metas'][0]._data[0][batch_id][ + 'pts_filename'] + box_mode_3d = data['img_metas'][0]._data[0][batch_id][ + 'box_mode_3d'] + elif mmcv.is_list_of(data['img_metas'][0], dict): + pts_filename = data['img_metas'][0][batch_id]['pts_filename'] + box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d'] + else: + ValueError( + f"Unsupported data type {type(data['img_metas'][0])} " + f'for visualization!') + file_name = osp.split(pts_filename)[-1].split('.')[0] + + assert out_dir is not None, 'Expect out_dir, got none.' + inds = result[batch_id]['pts_bbox']['scores_3d'] > 0.1 + pred_bboxes = result[batch_id]['pts_bbox']['boxes_3d'][inds] + + # for now we convert points and bbox into depth mode + if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d + == Box3DMode.LIDAR): + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d, + Box3DMode.DEPTH) + elif box_mode_3d != Box3DMode.DEPTH: + ValueError( + f'Unsupported box_mode_3d {box_mode_3d} for conversion!') + + pred_bboxes = pred_bboxes.tensor.cpu().numpy() + show_result(points, None, pred_bboxes, out_dir, file_name) diff --git a/mmdet3d/models/detectors/parta2.py b/mmdet3d/models/detectors/parta2.py index 459a915..a4fdb5e 100644 --- a/mmdet3d/models/detectors/parta2.py +++ b/mmdet3d/models/detectors/parta2.py @@ -1,151 +1,151 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.ops import Voxelization -from torch.nn import functional as F - -from .. import builder -from ..builder import DETECTORS -from .two_stage import TwoStage3DDetector - - -@DETECTORS.register_module() -class PartA2(TwoStage3DDetector): - r"""Part-A2 detector. 
- - Please refer to the `paper `_ - """ - - def __init__(self, - voxel_layer, - voxel_encoder, - middle_encoder, - backbone, - neck=None, - rpn_head=None, - roi_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(PartA2, self).__init__( - backbone=backbone, - neck=neck, - rpn_head=rpn_head, - roi_head=roi_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - pretrained=pretrained, - init_cfg=init_cfg) - self.voxel_layer = Voxelization(**voxel_layer) - self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) - self.middle_encoder = builder.build_middle_encoder(middle_encoder) - - def extract_feat(self, points, img_metas): - """Extract features from points.""" - voxel_dict = self.voxelize(points) - voxel_features = self.voxel_encoder(voxel_dict['voxels'], - voxel_dict['num_points'], - voxel_dict['coors']) - batch_size = voxel_dict['coors'][-1, 0].item() + 1 - feats_dict = self.middle_encoder(voxel_features, voxel_dict['coors'], - batch_size) - x = self.backbone(feats_dict['spatial_features']) - if self.with_neck: - neck_feats = self.neck(x) - feats_dict.update({'neck_feats': neck_feats}) - return feats_dict, voxel_dict - - @torch.no_grad() - def voxelize(self, points): - """Apply hard voxelization to points.""" - voxels, coors, num_points, voxel_centers = [], [], [], [] - for res in points: - res_voxels, res_coors, res_num_points = self.voxel_layer(res) - res_voxel_centers = ( - res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor( - self.voxel_layer.voxel_size) + res_voxels.new_tensor( - self.voxel_layer.point_cloud_range[0:3]) - voxels.append(res_voxels) - coors.append(res_coors) - num_points.append(res_num_points) - voxel_centers.append(res_voxel_centers) - - voxels = torch.cat(voxels, dim=0) - num_points = torch.cat(num_points, dim=0) - voxel_centers = torch.cat(voxel_centers, dim=0) - coors_batch = [] - for i, coor in enumerate(coors): - coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) - coors_batch.append(coor_pad) - coors_batch = torch.cat(coors_batch, dim=0) - - voxel_dict = dict( - voxels=voxels, - num_points=num_points, - coors=coors_batch, - voxel_centers=voxel_centers) - return voxel_dict - - def forward_train(self, - points, - img_metas, - gt_bboxes_3d, - gt_labels_3d, - gt_bboxes_ignore=None, - proposals=None): - """Training forward function. - - Args: - points (list[torch.Tensor]): Point cloud of each sample. - img_metas (list[dict]): Meta information of each sample - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. - gt_labels_3d (list[torch.Tensor]): Ground truth labels for - boxes of each sampole - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - - Returns: - dict: Losses of each branch. 
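PartA2.voxelize above additionally derives metric voxel centres from the integer voxel coordinates. A tiny numeric sketch of that computation with an arbitrary voxel size and point-cloud range:

    import torch

    voxel_size = torch.tensor([0.05, 0.05, 0.1])           # (dx, dy, dz)
    point_cloud_range = torch.tensor([0.0, -40.0, -3.0])   # lower (x, y, z) bound

    coors = torch.tensor([[2, 10, 4],     # (z, y, x) integer voxel indices
                          [0,  0, 0]])

    centers = (coors[:, [2, 1, 0]].float() + 0.5) * voxel_size + point_cloud_range
    # first voxel -> x = (4 + 0.5) * dx, y = (10 + 0.5) * dy + y_min, z = (2 + 0.5) * dz + z_min

The column reorder [2, 1, 0] converts the voxelizer's (z, y, x) coordinate order into the (x, y, z) order used by the voxel size and range, and the + 0.5 moves from a voxel corner to its centre.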
- """ - feats_dict, voxels_dict = self.extract_feat(points, img_metas) - - losses = dict() - - if self.with_rpn: - rpn_outs = self.rpn_head(feats_dict['neck_feats']) - rpn_loss_inputs = rpn_outs + (gt_bboxes_3d, gt_labels_3d, - img_metas) - rpn_losses = self.rpn_head.loss( - *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - losses.update(rpn_losses) - - proposal_cfg = self.train_cfg.get('rpn_proposal', - self.test_cfg.rpn) - proposal_inputs = rpn_outs + (img_metas, proposal_cfg) - proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) - else: - proposal_list = proposals - - roi_losses = self.roi_head.forward_train(feats_dict, voxels_dict, - img_metas, proposal_list, - gt_bboxes_3d, gt_labels_3d) - - losses.update(roi_losses) - - return losses - - def simple_test(self, points, img_metas, proposals=None, rescale=False): - """Test function without augmentaiton.""" - feats_dict, voxels_dict = self.extract_feat(points, img_metas) - - if self.with_rpn: - rpn_outs = self.rpn_head(feats_dict['neck_feats']) - proposal_cfg = self.test_cfg.rpn - bbox_inputs = rpn_outs + (img_metas, proposal_cfg) - proposal_list = self.rpn_head.get_bboxes(*bbox_inputs) - else: - proposal_list = proposals - - return self.roi_head.simple_test(feats_dict, voxels_dict, img_metas, - proposal_list) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.ops import Voxelization +from torch.nn import functional as F + +from .. import builder +from ..builder import DETECTORS +from .two_stage import TwoStage3DDetector + + +@DETECTORS.register_module() +class PartA2(TwoStage3DDetector): + r"""Part-A2 detector. + + Please refer to the `paper `_ + """ + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(PartA2, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + init_cfg=init_cfg) + self.voxel_layer = Voxelization(**voxel_layer) + self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) + self.middle_encoder = builder.build_middle_encoder(middle_encoder) + + def extract_feat(self, points, img_metas): + """Extract features from points.""" + voxel_dict = self.voxelize(points) + voxel_features = self.voxel_encoder(voxel_dict['voxels'], + voxel_dict['num_points'], + voxel_dict['coors']) + batch_size = voxel_dict['coors'][-1, 0].item() + 1 + feats_dict = self.middle_encoder(voxel_features, voxel_dict['coors'], + batch_size) + x = self.backbone(feats_dict['spatial_features']) + if self.with_neck: + neck_feats = self.neck(x) + feats_dict.update({'neck_feats': neck_feats}) + return feats_dict, voxel_dict + + @torch.no_grad() + def voxelize(self, points): + """Apply hard voxelization to points.""" + voxels, coors, num_points, voxel_centers = [], [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.voxel_layer(res) + res_voxel_centers = ( + res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor( + self.voxel_layer.voxel_size) + res_voxels.new_tensor( + self.voxel_layer.point_cloud_range[0:3]) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxel_centers.append(res_voxel_centers) + + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + voxel_centers = torch.cat(voxel_centers, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = 
F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + + voxel_dict = dict( + voxels=voxels, + num_points=num_points, + coors=coors_batch, + voxel_centers=voxel_centers) + return voxel_dict + + def forward_train(self, + points, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + gt_bboxes_ignore=None, + proposals=None): + """Training forward function. + + Args: + points (list[torch.Tensor]): Point cloud of each sample. + img_metas (list[dict]): Meta information of each sample + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + + Returns: + dict: Losses of each branch. + """ + feats_dict, voxels_dict = self.extract_feat(points, img_metas) + + losses = dict() + + if self.with_rpn: + rpn_outs = self.rpn_head(feats_dict['neck_feats']) + rpn_loss_inputs = rpn_outs + (gt_bboxes_3d, gt_labels_3d, + img_metas) + rpn_losses = self.rpn_head.loss( + *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get('rpn_proposal', + self.test_cfg.rpn) + proposal_inputs = rpn_outs + (img_metas, proposal_cfg) + proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + roi_losses = self.roi_head.forward_train(feats_dict, voxels_dict, + img_metas, proposal_list, + gt_bboxes_3d, gt_labels_3d) + + losses.update(roi_losses) + + return losses + + def simple_test(self, points, img_metas, proposals=None, rescale=False): + """Test function without augmentaiton.""" + feats_dict, voxels_dict = self.extract_feat(points, img_metas) + + if self.with_rpn: + rpn_outs = self.rpn_head(feats_dict['neck_feats']) + proposal_cfg = self.test_cfg.rpn + bbox_inputs = rpn_outs + (img_metas, proposal_cfg) + proposal_list = self.rpn_head.get_bboxes(*bbox_inputs) + else: + proposal_list = proposals + + return self.roi_head.simple_test(feats_dict, voxels_dict, img_metas, + proposal_list) diff --git a/mmdet3d/models/detectors/point_rcnn.py b/mmdet3d/models/detectors/point_rcnn.py index 31c8693..e1067bc 100644 --- a/mmdet3d/models/detectors/point_rcnn.py +++ b/mmdet3d/models/detectors/point_rcnn.py @@ -1,148 +1,148 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from ..builder import DETECTORS -from .two_stage import TwoStage3DDetector - - -@DETECTORS.register_module() -class PointRCNN(TwoStage3DDetector): - r"""PointRCNN detector. - - Please refer to the `PointRCNN `_ - - Args: - backbone (dict): Config dict of detector's backbone. - neck (dict, optional): Config dict of neck. Defaults to None. - rpn_head (dict, optional): Config of RPN head. Defaults to None. - roi_head (dict, optional): Config of ROI head. Defaults to None. - train_cfg (dict, optional): Train configs. Defaults to None. - test_cfg (dict, optional): Test configs. Defaults to None. - pretrained (str, optional): Model pretrained path. Defaults to None. - init_cfg (dict, optional): Config of initialization. Defaults to None. 
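During training, PartA2 builds RoI proposals with train_cfg's rpn_proposal settings when they are defined and otherwise falls back to the test-time RPN settings. A hedged sketch of that lookup with invented placeholder values (in the real detector these are mmcv ConfigDicts, which also allow attribute access):

    train_cfg = dict(
        rpn=dict(assigner=dict(pos_iou_thr=0.6, neg_iou_thr=0.45)),
        rpn_proposal=dict(nms_pre=9000, nms_post=512, nms_thr=0.8),
    )
    test_cfg = dict(rpn=dict(nms_pre=1024, nms_post=100, nms_thr=0.7))

    # mirrors train_cfg.get('rpn_proposal', test_cfg.rpn) in forward_train above
    proposal_cfg = train_cfg.get('rpn_proposal', test_cfg['rpn'])
    assert proposal_cfg['nms_post'] == 512   # training-time proposal settings win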
- """ - - def __init__(self, - backbone, - neck=None, - rpn_head=None, - roi_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(PointRCNN, self).__init__( - backbone=backbone, - neck=neck, - rpn_head=rpn_head, - roi_head=roi_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - pretrained=pretrained, - init_cfg=init_cfg) - - def extract_feat(self, points): - """Directly extract features from the backbone+neck. - - Args: - points (torch.Tensor): Input points. - - Returns: - dict: Features from the backbone+neck - """ - x = self.backbone(points) - - if self.with_neck: - x = self.neck(x) - return x - - def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d): - """Forward of training. - - Args: - points (list[torch.Tensor]): Points of each batch. - img_metas (list[dict]): Meta information of each sample. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - - Returns: - dict: Losses. - """ - losses = dict() - points_cat = torch.stack(points) - x = self.extract_feat(points_cat) - - # features for rcnn - backbone_feats = x['fp_features'].clone() - backbone_xyz = x['fp_xyz'].clone() - rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz} - - bbox_preds, cls_preds = self.rpn_head(x) - - rpn_loss = self.rpn_head.loss( - bbox_preds=bbox_preds, - cls_preds=cls_preds, - points=points, - gt_bboxes_3d=gt_bboxes_3d, - gt_labels_3d=gt_labels_3d, - img_metas=img_metas) - losses.update(rpn_loss) - - bbox_list = self.rpn_head.get_bboxes(points_cat, bbox_preds, cls_preds, - img_metas) - proposal_list = [ - dict( - boxes_3d=bboxes, - scores_3d=scores, - labels_3d=labels, - cls_preds=preds_cls) - for bboxes, scores, labels, preds_cls in bbox_list - ] - rcnn_feats.update({'points_cls_preds': cls_preds}) - - roi_losses = self.roi_head.forward_train(rcnn_feats, img_metas, - proposal_list, gt_bboxes_3d, - gt_labels_3d) - losses.update(roi_losses) - - return losses - - def simple_test(self, points, img_metas, imgs=None, rescale=False): - """Forward of testing. - - Args: - points (list[torch.Tensor]): Points of each sample. - img_metas (list[dict]): Image metas. - imgs (list[torch.Tensor], optional): Images of each sample. - Defaults to None. - rescale (bool, optional): Whether to rescale results. - Defaults to False. - - Returns: - list: Predicted 3d boxes. - """ - points_cat = torch.stack(points) - - x = self.extract_feat(points_cat) - # features for rcnn - backbone_feats = x['fp_features'].clone() - backbone_xyz = x['fp_xyz'].clone() - rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz} - bbox_preds, cls_preds = self.rpn_head(x) - rcnn_feats.update({'points_cls_preds': cls_preds}) - - bbox_list = self.rpn_head.get_bboxes( - points_cat, bbox_preds, cls_preds, img_metas, rescale=rescale) - - proposal_list = [ - dict( - boxes_3d=bboxes, - scores_3d=scores, - labels_3d=labels, - cls_preds=preds_cls) - for bboxes, scores, labels, preds_cls in bbox_list - ] - bbox_results = self.roi_head.simple_test(rcnn_feats, img_metas, - proposal_list) - - return bbox_results +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from ..builder import DETECTORS +from .two_stage import TwoStage3DDetector + + +@DETECTORS.register_module() +class PointRCNN(TwoStage3DDetector): + r"""PointRCNN detector. + + Please refer to the `PointRCNN `_ + + Args: + backbone (dict): Config dict of detector's backbone. + neck (dict, optional): Config dict of neck. 
Defaults to None. + rpn_head (dict, optional): Config of RPN head. Defaults to None. + roi_head (dict, optional): Config of ROI head. Defaults to None. + train_cfg (dict, optional): Train configs. Defaults to None. + test_cfg (dict, optional): Test configs. Defaults to None. + pretrained (str, optional): Model pretrained path. Defaults to None. + init_cfg (dict, optional): Config of initialization. Defaults to None. + """ + + def __init__(self, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(PointRCNN, self).__init__( + backbone=backbone, + neck=neck, + rpn_head=rpn_head, + roi_head=roi_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + init_cfg=init_cfg) + + def extract_feat(self, points): + """Directly extract features from the backbone+neck. + + Args: + points (torch.Tensor): Input points. + + Returns: + dict: Features from the backbone+neck + """ + x = self.backbone(points) + + if self.with_neck: + x = self.neck(x) + return x + + def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d): + """Forward of training. + + Args: + points (list[torch.Tensor]): Points of each batch. + img_metas (list[dict]): Meta information of each sample. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. + + Returns: + dict: Losses. + """ + losses = dict() + points_cat = torch.stack(points) + x = self.extract_feat(points_cat) + + # features for rcnn + backbone_feats = x['fp_features'].clone() + backbone_xyz = x['fp_xyz'].clone() + rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz} + + bbox_preds, cls_preds = self.rpn_head(x) + + rpn_loss = self.rpn_head.loss( + bbox_preds=bbox_preds, + cls_preds=cls_preds, + points=points, + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + img_metas=img_metas) + losses.update(rpn_loss) + + bbox_list = self.rpn_head.get_bboxes(points_cat, bbox_preds, cls_preds, + img_metas) + proposal_list = [ + dict( + boxes_3d=bboxes, + scores_3d=scores, + labels_3d=labels, + cls_preds=preds_cls) + for bboxes, scores, labels, preds_cls in bbox_list + ] + rcnn_feats.update({'points_cls_preds': cls_preds}) + + roi_losses = self.roi_head.forward_train(rcnn_feats, img_metas, + proposal_list, gt_bboxes_3d, + gt_labels_3d) + losses.update(roi_losses) + + return losses + + def simple_test(self, points, img_metas, imgs=None, rescale=False): + """Forward of testing. + + Args: + points (list[torch.Tensor]): Points of each sample. + img_metas (list[dict]): Image metas. + imgs (list[torch.Tensor], optional): Images of each sample. + Defaults to None. + rescale (bool, optional): Whether to rescale results. + Defaults to False. + + Returns: + list: Predicted 3d boxes. 
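PointRCNN hands its RPN output to the RoI head as a list of per-sample dicts rather than raw tuples. A minimal sketch of that repackaging with random stand-in tensors (the real boxes_3d entries are LiDAR box objects, not plain tensors):

    import torch

    bbox_list = [
        (torch.rand(50, 7), torch.rand(50), torch.randint(0, 3, (50,)), torch.rand(50, 3)),
        (torch.rand(42, 7), torch.rand(42), torch.randint(0, 3, (42,)), torch.rand(42, 3)),
    ]

    proposal_list = [
        dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels, cls_preds=preds_cls)
        for bboxes, scores, labels, preds_cls in bbox_list
    ]
    assert len(proposal_list) == 2 and proposal_list[0]['boxes_3d'].shape == (50, 7)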
+ """ + points_cat = torch.stack(points) + + x = self.extract_feat(points_cat) + # features for rcnn + backbone_feats = x['fp_features'].clone() + backbone_xyz = x['fp_xyz'].clone() + rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz} + bbox_preds, cls_preds = self.rpn_head(x) + rcnn_feats.update({'points_cls_preds': cls_preds}) + + bbox_list = self.rpn_head.get_bboxes( + points_cat, bbox_preds, cls_preds, img_metas, rescale=rescale) + + proposal_list = [ + dict( + boxes_3d=bboxes, + scores_3d=scores, + labels_3d=labels, + cls_preds=preds_cls) + for bboxes, scores, labels, preds_cls in bbox_list + ] + bbox_results = self.roi_head.simple_test(rcnn_feats, img_metas, + proposal_list) + + return bbox_results diff --git a/mmdet3d/models/detectors/sassd.py b/mmdet3d/models/detectors/sassd.py index 2151c4e..2ccf966 100644 --- a/mmdet3d/models/detectors/sassd.py +++ b/mmdet3d/models/detectors/sassd.py @@ -1,136 +1,136 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.ops import Voxelization -from mmcv.runner import force_fp32 -from torch.nn import functional as F - -from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d -from mmdet.models.builder import DETECTORS -from .. import builder -from .single_stage import SingleStage3DDetector - - -@DETECTORS.register_module() -class SASSD(SingleStage3DDetector): - r"""`SASSD ` _ for 3D detection.""" - - def __init__(self, - voxel_layer, - voxel_encoder, - middle_encoder, - backbone, - neck=None, - bbox_head=None, - train_cfg=None, - test_cfg=None, - init_cfg=None, - pretrained=None): - super(SASSD, self).__init__( - backbone=backbone, - neck=neck, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - init_cfg=init_cfg, - pretrained=pretrained) - - self.voxel_layer = Voxelization(**voxel_layer) - self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) - self.middle_encoder = builder.build_middle_encoder(middle_encoder) - - def extract_feat(self, points, img_metas=None, test_mode=False): - """Extract features from points.""" - voxels, num_points, coors = self.voxelize(points) - voxel_features = self.voxel_encoder(voxels, num_points, coors) - batch_size = coors[-1, 0].item() + 1 - x, point_misc = self.middle_encoder(voxel_features, coors, batch_size, - test_mode) - x = self.backbone(x) - if self.with_neck: - x = self.neck(x) - return x, point_misc - - @torch.no_grad() - @force_fp32() - def voxelize(self, points): - """Apply hard voxelization to points.""" - voxels, coors, num_points = [], [], [] - for res in points: - res_voxels, res_coors, res_num_points = self.voxel_layer(res) - voxels.append(res_voxels) - coors.append(res_coors) - num_points.append(res_num_points) - voxels = torch.cat(voxels, dim=0) - num_points = torch.cat(num_points, dim=0) - coors_batch = [] - for i, coor in enumerate(coors): - coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) - coors_batch.append(coor_pad) - coors_batch = torch.cat(coors_batch, dim=0) - return voxels, num_points, coors_batch - - def forward_train(self, - points, - img_metas, - gt_bboxes_3d, - gt_labels_3d, - gt_bboxes_ignore=None): - """Training forward function. - - Args: - points (list[torch.Tensor]): Point cloud of each sample. - img_metas (list[dict]): Meta information of each sample - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. 
- gt_labels_3d (list[torch.Tensor]): Ground truth labels for - boxes of each sampole - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - - Returns: - dict: Losses of each branch. - """ - - x, point_misc = self.extract_feat(points, img_metas, test_mode=False) - aux_loss = self.middle_encoder.aux_loss(*point_misc, gt_bboxes_3d) - - outs = self.bbox_head(x) - loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) - losses = self.bbox_head.loss( - *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - losses.update(aux_loss) - return losses - - def simple_test(self, points, img_metas, imgs=None, rescale=False): - """Test function without augmentaiton.""" - x, _ = self.extract_feat(points, img_metas, test_mode=True) - outs = self.bbox_head(x) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test function with augmentaiton.""" - feats = self.extract_feats(points, img_metas, test_mode=True) - - # only support aug_test for one sample - aug_bboxes = [] - for x, img_meta in zip(feats, img_metas): - outs = self.bbox_head(x) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_meta, rescale=rescale) - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - # after merging, bboxes will be rescaled to the original image size - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, - self.bbox_head.test_cfg) - - return [merged_bboxes] +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.ops import Voxelization +from mmcv.runner import force_fp32 +from torch.nn import functional as F + +from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d +from mmdet.models.builder import DETECTORS +from .. 
import builder +from .single_stage import SingleStage3DDetector + + +@DETECTORS.register_module() +class SASSD(SingleStage3DDetector): + r"""`SASSD ` _ for 3D detection.""" + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + pretrained=None): + super(SASSD, self).__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + pretrained=pretrained) + + self.voxel_layer = Voxelization(**voxel_layer) + self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) + self.middle_encoder = builder.build_middle_encoder(middle_encoder) + + def extract_feat(self, points, img_metas=None, test_mode=False): + """Extract features from points.""" + voxels, num_points, coors = self.voxelize(points) + voxel_features = self.voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0].item() + 1 + x, point_misc = self.middle_encoder(voxel_features, coors, batch_size, + test_mode) + x = self.backbone(x) + if self.with_neck: + x = self.neck(x) + return x, point_misc + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + """Apply hard voxelization to points.""" + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + gt_bboxes_ignore=None): + """Training forward function. + + Args: + points (list[torch.Tensor]): Point cloud of each sample. + img_metas (list[dict]): Meta information of each sample + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + + Returns: + dict: Losses of each branch. 
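# A minimal sketch of the batching trick used by `voxelize` above: each
# sample's voxel coordinates get their batch index prepended with
# F.pad(coor, (1, 0), value=i) before concatenation. The coordinates below are
# dummy (z, y, x) indices, not real voxelization output.
import torch
from torch.nn import functional as F

coors_per_sample = [
    torch.tensor([[0, 2, 3], [1, 4, 5]]),  # two voxels in sample 0
    torch.tensor([[0, 1, 1]]),             # one voxel in sample 1
]
coors_batch = torch.cat([
    F.pad(coor, (1, 0), mode='constant', value=i)  # -> (batch_idx, z, y, x)
    for i, coor in enumerate(coors_per_sample)
], dim=0)
# tensor([[0, 0, 2, 3],
#         [0, 1, 4, 5],
#         [1, 0, 1, 1]])
batch_size = coors_batch[-1, 0].item() + 1  # == 2, recovered as in `extract_feat`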
+ """ + + x, point_misc = self.extract_feat(points, img_metas, test_mode=False) + aux_loss = self.middle_encoder.aux_loss(*point_misc, gt_bboxes_3d) + + outs = self.bbox_head(x) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) + losses = self.bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(aux_loss) + return losses + + def simple_test(self, points, img_metas, imgs=None, rescale=False): + """Test function without augmentaiton.""" + x, _ = self.extract_feat(points, img_metas, test_mode=True) + outs = self.bbox_head(x) + bbox_list = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test function with augmentaiton.""" + feats = self.extract_feats(points, img_metas, test_mode=True) + + # only support aug_test for one sample + aug_bboxes = [] + for x, img_meta in zip(feats, img_metas): + outs = self.bbox_head(x) + bbox_list = self.bbox_head.get_bboxes( + *outs, img_meta, rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) + for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, + self.bbox_head.test_cfg) + + return [merged_bboxes] diff --git a/mmdet3d/models/detectors/single_stage.py b/mmdet3d/models/detectors/single_stage.py index 11f8479..f81dec9 100644 --- a/mmdet3d/models/detectors/single_stage.py +++ b/mmdet3d/models/detectors/single_stage.py @@ -1,71 +1,71 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..builder import DETECTORS, build_backbone, build_head, build_neck -from .base import Base3DDetector - - -@DETECTORS.register_module() -class SingleStage3DDetector(Base3DDetector): - """SingleStage3DDetector. - - This class serves as a base class for single-stage 3D detectors. - - Args: - backbone (dict): Config dict of detector's backbone. - neck (dict, optional): Config dict of neck. Defaults to None. - bbox_head (dict, optional): Config dict of box head. Defaults to None. - train_cfg (dict, optional): Config dict of training hyper-parameters. - Defaults to None. - test_cfg (dict, optional): Config dict of test hyper-parameters. - Defaults to None. - pretrained (str, optional): Path of pretrained models. - Defaults to None. - """ - - def __init__(self, - backbone, - neck=None, - bbox_head=None, - train_cfg=None, - test_cfg=None, - init_cfg=None, - pretrained=None): - super(SingleStage3DDetector, self).__init__(init_cfg) - self.backbone = build_backbone(backbone) - if neck is not None: - self.neck = build_neck(neck) - bbox_head.update(train_cfg=train_cfg) - bbox_head.update(test_cfg=test_cfg) - self.bbox_head = build_head(bbox_head) - self.train_cfg = train_cfg - self.test_cfg = test_cfg - - def forward_dummy(self, points): - """Used for computing network flops. - - See `mmdetection/tools/analysis_tools/get_flops.py` - """ - x = self.extract_feat(points) - try: - sample_mod = self.train_cfg.sample_mod - outs = self.bbox_head(x, sample_mod) - except AttributeError: - outs = self.bbox_head(x) - return outs - - def extract_feat(self, points, img_metas=None): - """Directly extract features from the backbone+neck. - - Args: - points (torch.Tensor): Input points. 
- """ - x = self.backbone(points) - if self.with_neck: - x = self.neck(x) - return x - - def extract_feats(self, points, img_metas): - """Extract features of multiple samples.""" - return [ - self.extract_feat(pts, img_meta) - for pts, img_meta in zip(points, img_metas) - ] +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import DETECTORS, build_backbone, build_head, build_neck +from .base import Base3DDetector + + +@DETECTORS.register_module() +class SingleStage3DDetector(Base3DDetector): + """SingleStage3DDetector. + + This class serves as a base class for single-stage 3D detectors. + + Args: + backbone (dict): Config dict of detector's backbone. + neck (dict, optional): Config dict of neck. Defaults to None. + bbox_head (dict, optional): Config dict of box head. Defaults to None. + train_cfg (dict, optional): Config dict of training hyper-parameters. + Defaults to None. + test_cfg (dict, optional): Config dict of test hyper-parameters. + Defaults to None. + pretrained (str, optional): Path of pretrained models. + Defaults to None. + """ + + def __init__(self, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + pretrained=None): + super(SingleStage3DDetector, self).__init__(init_cfg) + self.backbone = build_backbone(backbone) + if neck is not None: + self.neck = build_neck(neck) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = build_head(bbox_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def forward_dummy(self, points): + """Used for computing network flops. + + See `mmdetection/tools/analysis_tools/get_flops.py` + """ + x = self.extract_feat(points) + try: + sample_mod = self.train_cfg.sample_mod + outs = self.bbox_head(x, sample_mod) + except AttributeError: + outs = self.bbox_head(x) + return outs + + def extract_feat(self, points, img_metas=None): + """Directly extract features from the backbone+neck. + + Args: + points (torch.Tensor): Input points. + """ + x = self.backbone(points) + if self.with_neck: + x = self.neck(x) + return x + + def extract_feats(self, points, img_metas): + """Extract features of multiple samples.""" + return [ + self.extract_feat(pts, img_meta) + for pts, img_meta in zip(points, img_metas) + ] diff --git a/mmdet3d/models/detectors/single_stage_mono3d.py b/mmdet3d/models/detectors/single_stage_mono3d.py index 464fab0..7e58c4f 100644 --- a/mmdet3d/models/detectors/single_stage_mono3d.py +++ b/mmdet3d/models/detectors/single_stage_mono3d.py @@ -1,250 +1,250 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings -from os import path as osp - -import mmcv -import numpy as np -import torch -from mmcv.parallel import DataContainer as DC - -from mmdet3d.core import (CameraInstance3DBoxes, bbox3d2result, - show_multi_modality_result) -from mmdet.models.detectors import SingleStageDetector -from ..builder import DETECTORS, build_backbone, build_head, build_neck - - -@DETECTORS.register_module() -class SingleStageMono3DDetector(SingleStageDetector): - """Base class for monocular 3D single-stage detectors. - - Single-stage detectors directly and densely predict bounding boxes on the - output features of the backbone+neck. 
- """ - - def __init__(self, - backbone, - neck=None, - bbox_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(SingleStageDetector, self).__init__(init_cfg) - if pretrained: - warnings.warn('DeprecationWarning: pretrained is deprecated, ' - 'please use "init_cfg" instead') - backbone.pretrained = pretrained - self.backbone = build_backbone(backbone) - if neck is not None: - self.neck = build_neck(neck) - bbox_head.update(train_cfg=train_cfg) - bbox_head.update(test_cfg=test_cfg) - self.bbox_head = build_head(bbox_head) - self.train_cfg = train_cfg - self.test_cfg = test_cfg - - def extract_feats(self, imgs): - """Directly extract features from the backbone+neck.""" - assert isinstance(imgs, list) - return [self.extract_feat(img) for img in imgs] - - def forward_train(self, - img, - img_metas, - gt_bboxes, - gt_labels, - gt_bboxes_3d, - gt_labels_3d, - centers2d, - depths, - attr_labels=None, - gt_bboxes_ignore=None): - """ - Args: - img (Tensor): Input images of shape (N, C, H, W). - Typically these should be mean centered and std scaled. - img_metas (list[dict]): A List of image info dict where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. - For details on the values of these keys see - :class:`mmdet.datasets.pipelines.Collect`. - gt_bboxes (list[Tensor]): Each item are the truth boxes for each - image in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): Class indices corresponding to each box - gt_bboxes_3d (list[Tensor]): Each item are the 3D truth boxes for - each image in [x, y, z, x_size, y_size, z_size, yaw, vx, vy] - format. - gt_labels_3d (list[Tensor]): 3D class indices corresponding to - each box. - centers2d (list[Tensor]): Projected 3D centers onto 2D images. - depths (list[Tensor]): Depth of projected centers on 2D images. - attr_labels (list[Tensor], optional): Attribute indices - corresponding to each box - gt_bboxes_ignore (list[Tensor]): Specify which bounding - boxes can be ignored when computing the loss. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - x = self.extract_feat(img) - losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, - gt_labels, gt_bboxes_3d, - gt_labels_3d, centers2d, depths, - attr_labels, gt_bboxes_ignore) - return losses - - def simple_test(self, img, img_metas, rescale=False): - """Test function without test time augmentation. - - Args: - imgs (list[torch.Tensor]): List of multiple images - img_metas (list[dict]): List of image information. - rescale (bool, optional): Whether to rescale the results. - Defaults to False. - - Returns: - list[list[np.ndarray]]: BBox results of each image and classes. - The outer list corresponds to each image. The inner list - corresponds to each class. 
- """ - x = self.extract_feat(img) - outs = self.bbox_head(x) - bbox_outputs = self.bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) - - if self.bbox_head.pred_bbox2d: - from mmdet.core import bbox2result - bbox2d_img = [ - bbox2result(bboxes2d, labels, self.bbox_head.num_classes) - for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs - ] - bbox_outputs = [bbox_outputs[0][:-1]] - - bbox_img = [ - bbox3d2result(bboxes, scores, labels, attrs) - for bboxes, scores, labels, attrs in bbox_outputs - ] - - bbox_list = [dict() for i in range(len(img_metas))] - for result_dict, img_bbox in zip(bbox_list, bbox_img): - result_dict['img_bbox'] = img_bbox - if self.bbox_head.pred_bbox2d: - for result_dict, img_bbox2d in zip(bbox_list, bbox2d_img): - result_dict['img_bbox2d'] = img_bbox2d - return bbox_list - - def aug_test(self, imgs, img_metas, rescale=False): - """Test function with test time augmentation.""" - feats = self.extract_feats(imgs) - - # only support aug_test for one sample - outs_list = [self.bbox_head(x) for x in feats] - for i, img_meta in enumerate(img_metas): - if img_meta[0]['pcd_horizontal_flip']: - for j in range(len(outs_list[i])): # for each prediction - if outs_list[i][j][0] is None: - continue - for k in range(len(outs_list[i][j])): - # every stride of featmap - outs_list[i][j][k] = torch.flip( - outs_list[i][j][k], dims=[3]) - reg = outs_list[i][1] - for reg_feat in reg: - # offset_x - reg_feat[:, 0, :, :] = 1 - reg_feat[:, 0, :, :] - # velo_x - if self.bbox_head.pred_velo: - reg_feat[:, 7, :, :] = -reg_feat[:, 7, :, :] - # rotation - reg_feat[:, 6, :, :] = -reg_feat[:, 6, :, :] + np.pi - - merged_outs = [] - for i in range(len(outs_list[0])): # for each prediction - merged_feats = [] - for j in range(len(outs_list[0][i])): - if outs_list[0][i][0] is None: - merged_feats.append(None) - continue - # for each stride of featmap - avg_feats = torch.mean( - torch.cat([x[i][j] for x in outs_list]), - dim=0, - keepdim=True) - if i == 1: # regression predictions - # rot/velo/2d det keeps the original - avg_feats[:, 6:, :, :] = \ - outs_list[0][i][j][:, 6:, :, :] - if i == 2: - # dir_cls keeps the original - avg_feats = outs_list[0][i][j] - merged_feats.append(avg_feats) - merged_outs.append(merged_feats) - merged_outs = tuple(merged_outs) - - bbox_outputs = self.bbox_head.get_bboxes( - *merged_outs, img_metas[0], rescale=rescale) - if self.bbox_head.pred_bbox2d: - from mmdet.core import bbox2result - bbox2d_img = [ - bbox2result(bboxes2d, labels, self.bbox_head.num_classes) - for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs - ] - bbox_outputs = [bbox_outputs[0][:-1]] - - bbox_img = [ - bbox3d2result(bboxes, scores, labels, attrs) - for bboxes, scores, labels, attrs in bbox_outputs - ] - - bbox_list = dict() - bbox_list.update(img_bbox=bbox_img[0]) - if self.bbox_head.pred_bbox2d: - bbox_list.update(img_bbox2d=bbox2d_img[0]) - - return [bbox_list] - - def show_results(self, data, result, out_dir, show=False, score_thr=None): - """Results visualization. - - Args: - data (list[dict]): Input images and the information of the sample. - result (list[dict]): Prediction results. - out_dir (str): Output directory of visualization result. - show (bool, optional): Determines whether you are - going to show result by open3d. - Defaults to False. - TODO: implement score_thr of single_stage_mono3d. - score_thr (float, optional): Score threshold of bounding boxes. - Default to None. - Not implemented yet, but it is here for unification. 
- """ - for batch_id in range(len(result)): - if isinstance(data['img_metas'][0], DC): - img_filename = data['img_metas'][0]._data[0][batch_id][ - 'filename'] - cam2img = data['img_metas'][0]._data[0][batch_id]['cam2img'] - elif mmcv.is_list_of(data['img_metas'][0], dict): - img_filename = data['img_metas'][0][batch_id]['filename'] - cam2img = data['img_metas'][0][batch_id]['cam2img'] - else: - ValueError( - f"Unsupported data type {type(data['img_metas'][0])} " - f'for visualization!') - img = mmcv.imread(img_filename) - file_name = osp.split(img_filename)[-1].split('.')[0] - - assert out_dir is not None, 'Expect out_dir, got none.' - - pred_bboxes = result[batch_id]['img_bbox']['boxes_3d'] - assert isinstance(pred_bboxes, CameraInstance3DBoxes), \ - f'unsupported predicted bbox type {type(pred_bboxes)}' - - show_multi_modality_result( - img, - None, - pred_bboxes, - cam2img, - out_dir, - file_name, - 'camera', - show=show) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from os import path as osp + +import mmcv +import numpy as np +import torch +from mmcv.parallel import DataContainer as DC + +from mmdet3d.core import (CameraInstance3DBoxes, bbox3d2result, + show_multi_modality_result) +from mmdet.models.detectors import SingleStageDetector +from ..builder import DETECTORS, build_backbone, build_head, build_neck + + +@DETECTORS.register_module() +class SingleStageMono3DDetector(SingleStageDetector): + """Base class for monocular 3D single-stage detectors. + + Single-stage detectors directly and densely predict bounding boxes on the + output features of the backbone+neck. + """ + + def __init__(self, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(SingleStageDetector, self).__init__(init_cfg) + if pretrained: + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + backbone.pretrained = pretrained + self.backbone = build_backbone(backbone) + if neck is not None: + self.neck = build_neck(neck) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = build_head(bbox_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def extract_feats(self, imgs): + """Directly extract features from the backbone+neck.""" + assert isinstance(imgs, list) + return [self.extract_feat(img) for img in imgs] + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels=None, + gt_bboxes_ignore=None): + """ + Args: + img (Tensor): Input images of shape (N, C, H, W). + Typically these should be mean centered and std scaled. + img_metas (list[dict]): A List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + :class:`mmdet.datasets.pipelines.Collect`. + gt_bboxes (list[Tensor]): Each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): Class indices corresponding to each box + gt_bboxes_3d (list[Tensor]): Each item are the 3D truth boxes for + each image in [x, y, z, x_size, y_size, z_size, yaw, vx, vy] + format. + gt_labels_3d (list[Tensor]): 3D class indices corresponding to + each box. + centers2d (list[Tensor]): Projected 3D centers onto 2D images. + depths (list[Tensor]): Depth of projected centers on 2D images. 
+ attr_labels (list[Tensor], optional): Attribute indices + corresponding to each box + gt_bboxes_ignore (list[Tensor]): Specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + x = self.extract_feat(img) + losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, + gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, + attr_labels, gt_bboxes_ignore) + return losses + + def simple_test(self, img, img_metas, rescale=False): + """Test function without test time augmentation. + + Args: + imgs (list[torch.Tensor]): List of multiple images + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[list[np.ndarray]]: BBox results of each image and classes. + The outer list corresponds to each image. The inner list + corresponds to each class. + """ + x = self.extract_feat(img) + outs = self.bbox_head(x) + bbox_outputs = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + + if self.bbox_head.pred_bbox2d: + from mmdet.core import bbox2result + bbox2d_img = [ + bbox2result(bboxes2d, labels, self.bbox_head.num_classes) + for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs + ] + bbox_outputs = [bbox_outputs[0][:-1]] + + bbox_img = [ + bbox3d2result(bboxes, scores, labels, attrs) + for bboxes, scores, labels, attrs in bbox_outputs + ] + + bbox_list = [dict() for i in range(len(img_metas))] + for result_dict, img_bbox in zip(bbox_list, bbox_img): + result_dict['img_bbox'] = img_bbox + if self.bbox_head.pred_bbox2d: + for result_dict, img_bbox2d in zip(bbox_list, bbox2d_img): + result_dict['img_bbox2d'] = img_bbox2d + return bbox_list + + def aug_test(self, imgs, img_metas, rescale=False): + """Test function with test time augmentation.""" + feats = self.extract_feats(imgs) + + # only support aug_test for one sample + outs_list = [self.bbox_head(x) for x in feats] + for i, img_meta in enumerate(img_metas): + if img_meta[0]['pcd_horizontal_flip']: + for j in range(len(outs_list[i])): # for each prediction + if outs_list[i][j][0] is None: + continue + for k in range(len(outs_list[i][j])): + # every stride of featmap + outs_list[i][j][k] = torch.flip( + outs_list[i][j][k], dims=[3]) + reg = outs_list[i][1] + for reg_feat in reg: + # offset_x + reg_feat[:, 0, :, :] = 1 - reg_feat[:, 0, :, :] + # velo_x + if self.bbox_head.pred_velo: + reg_feat[:, 7, :, :] = -reg_feat[:, 7, :, :] + # rotation + reg_feat[:, 6, :, :] = -reg_feat[:, 6, :, :] + np.pi + + merged_outs = [] + for i in range(len(outs_list[0])): # for each prediction + merged_feats = [] + for j in range(len(outs_list[0][i])): + if outs_list[0][i][0] is None: + merged_feats.append(None) + continue + # for each stride of featmap + avg_feats = torch.mean( + torch.cat([x[i][j] for x in outs_list]), + dim=0, + keepdim=True) + if i == 1: # regression predictions + # rot/velo/2d det keeps the original + avg_feats[:, 6:, :, :] = \ + outs_list[0][i][j][:, 6:, :, :] + if i == 2: + # dir_cls keeps the original + avg_feats = outs_list[0][i][j] + merged_feats.append(avg_feats) + merged_outs.append(merged_feats) + merged_outs = tuple(merged_outs) + + bbox_outputs = self.bbox_head.get_bboxes( + *merged_outs, img_metas[0], rescale=rescale) + if self.bbox_head.pred_bbox2d: + from mmdet.core import bbox2result + bbox2d_img = [ + bbox2result(bboxes2d, labels, self.bbox_head.num_classes) + for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs + ] + 
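# A toy check of the flip-back step used in `aug_test` above for horizontally
# flipped inputs: feature maps are flipped along the width axis, offset_x is
# mirrored (x -> 1 - x), velo_x is negated and yaw becomes -yaw + pi. The
# channel layout (0: offset_x, 6: yaw, 7: velo_x) is taken from the comments
# above; the tensor itself is a dummy.
import numpy as np
import torch

reg_feat = torch.zeros(1, 8, 2, 2)  # (N, C, H, W) dummy regression map
reg_feat[:, 0] = 0.3                # offset_x
reg_feat[:, 6] = 0.5                # yaw
reg_feat[:, 7] = 1.2                # velo_x

reg_feat = torch.flip(reg_feat, dims=[3])   # undo the horizontal image flip
reg_feat[:, 0] = 1 - reg_feat[:, 0]         # mirror offset_x
reg_feat[:, 7] = -reg_feat[:, 7]            # negate velo_x
reg_feat[:, 6] = -reg_feat[:, 6] + np.pi    # mirror yaw

assert torch.allclose(reg_feat[:, 0], torch.tensor(0.7))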
bbox_outputs = [bbox_outputs[0][:-1]] + + bbox_img = [ + bbox3d2result(bboxes, scores, labels, attrs) + for bboxes, scores, labels, attrs in bbox_outputs + ] + + bbox_list = dict() + bbox_list.update(img_bbox=bbox_img[0]) + if self.bbox_head.pred_bbox2d: + bbox_list.update(img_bbox2d=bbox2d_img[0]) + + return [bbox_list] + + def show_results(self, data, result, out_dir, show=False, score_thr=None): + """Results visualization. + + Args: + data (list[dict]): Input images and the information of the sample. + result (list[dict]): Prediction results. + out_dir (str): Output directory of visualization result. + show (bool, optional): Determines whether you are + going to show result by open3d. + Defaults to False. + TODO: implement score_thr of single_stage_mono3d. + score_thr (float, optional): Score threshold of bounding boxes. + Default to None. + Not implemented yet, but it is here for unification. + """ + for batch_id in range(len(result)): + if isinstance(data['img_metas'][0], DC): + img_filename = data['img_metas'][0]._data[0][batch_id][ + 'filename'] + cam2img = data['img_metas'][0]._data[0][batch_id]['cam2img'] + elif mmcv.is_list_of(data['img_metas'][0], dict): + img_filename = data['img_metas'][0][batch_id]['filename'] + cam2img = data['img_metas'][0][batch_id]['cam2img'] + else: + ValueError( + f"Unsupported data type {type(data['img_metas'][0])} " + f'for visualization!') + img = mmcv.imread(img_filename) + file_name = osp.split(img_filename)[-1].split('.')[0] + + assert out_dir is not None, 'Expect out_dir, got none.' + + pred_bboxes = result[batch_id]['img_bbox']['boxes_3d'] + assert isinstance(pred_bboxes, CameraInstance3DBoxes), \ + f'unsupported predicted bbox type {type(pred_bboxes)}' + + show_multi_modality_result( + img, + None, + pred_bboxes, + cam2img, + out_dir, + file_name, + 'camera', + show=show) diff --git a/mmdet3d/models/detectors/smoke_mono3d.py b/mmdet3d/models/detectors/smoke_mono3d.py index 241187f..895f0d9 100644 --- a/mmdet3d/models/detectors/smoke_mono3d.py +++ b/mmdet3d/models/detectors/smoke_mono3d.py @@ -1,21 +1,21 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..builder import DETECTORS -from .single_stage_mono3d import SingleStageMono3DDetector - - -@DETECTORS.register_module() -class SMOKEMono3D(SingleStageMono3DDetector): - r"""SMOKE `_ for monocular 3D object - detection. - - """ - - def __init__(self, - backbone, - neck, - bbox_head, - train_cfg=None, - test_cfg=None, - pretrained=None): - super(SMOKEMono3D, self).__init__(backbone, neck, bbox_head, train_cfg, - test_cfg, pretrained) +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import DETECTORS +from .single_stage_mono3d import SingleStageMono3DDetector + + +@DETECTORS.register_module() +class SMOKEMono3D(SingleStageMono3DDetector): + r"""SMOKE `_ for monocular 3D object + detection. + + """ + + def __init__(self, + backbone, + neck, + bbox_head, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(SMOKEMono3D, self).__init__(backbone, neck, bbox_head, train_cfg, + test_cfg, pretrained) diff --git a/mmdet3d/models/detectors/ssd3dnet.py b/mmdet3d/models/detectors/ssd3dnet.py index fd5e310..2e95c56 100644 --- a/mmdet3d/models/detectors/ssd3dnet.py +++ b/mmdet3d/models/detectors/ssd3dnet.py @@ -1,26 +1,26 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..builder import DETECTORS -from .votenet import VoteNet - - -@DETECTORS.register_module() -class SSD3DNet(VoteNet): - """3DSSDNet model. 
- - https://arxiv.org/abs/2002.10187.pdf - """ - - def __init__(self, - backbone, - bbox_head=None, - train_cfg=None, - test_cfg=None, - init_cfg=None, - pretrained=None): - super(SSD3DNet, self).__init__( - backbone=backbone, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - init_cfg=init_cfg, - pretrained=pretrained) +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import DETECTORS +from .votenet import VoteNet + + +@DETECTORS.register_module() +class SSD3DNet(VoteNet): + """3DSSDNet model. + + https://arxiv.org/abs/2002.10187.pdf + """ + + def __init__(self, + backbone, + bbox_head=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + pretrained=None): + super(SSD3DNet, self).__init__( + backbone=backbone, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + pretrained=pretrained) diff --git a/mmdet3d/models/detectors/two_stage.py b/mmdet3d/models/detectors/two_stage.py index 707f706..790f192 100644 --- a/mmdet3d/models/detectors/two_stage.py +++ b/mmdet3d/models/detectors/two_stage.py @@ -1,51 +1,51 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -from mmdet.models import TwoStageDetector -from ..builder import DETECTORS, build_backbone, build_head, build_neck -from .base import Base3DDetector - - -@DETECTORS.register_module() -class TwoStage3DDetector(Base3DDetector, TwoStageDetector): - """Base class of two-stage 3D detector. - - It inherits original ``:class:TwoStageDetector`` and - ``:class:Base3DDetector``. This class could serve as a base class for all - two-stage 3D detectors. - """ - - def __init__(self, - backbone, - neck=None, - rpn_head=None, - roi_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(TwoStageDetector, self).__init__(init_cfg) - if pretrained: - warnings.warn('DeprecationWarning: pretrained is deprecated, ' - 'please use "init_cfg" instead') - backbone.pretrained = pretrained - self.backbone = build_backbone(backbone) - self.train_cfg = train_cfg - self.test_cfg = test_cfg - if neck is not None: - self.neck = build_neck(neck) - - if rpn_head is not None: - rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None - rpn_head_ = rpn_head.copy() - rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) - self.rpn_head = build_head(rpn_head_) - - if roi_head is not None: - # update train and test cfg here for now - # TODO: refactor assigner & sampler - rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None - roi_head.update(train_cfg=rcnn_train_cfg) - roi_head.update(test_cfg=test_cfg.rcnn) - roi_head.pretrained = pretrained - self.roi_head = build_head(roi_head) +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmdet.models import TwoStageDetector +from ..builder import DETECTORS, build_backbone, build_head, build_neck +from .base import Base3DDetector + + +@DETECTORS.register_module() +class TwoStage3DDetector(Base3DDetector, TwoStageDetector): + """Base class of two-stage 3D detector. + + It inherits original ``:class:TwoStageDetector`` and + ``:class:Base3DDetector``. This class could serve as a base class for all + two-stage 3D detectors. 
+ """ + + def __init__(self, + backbone, + neck=None, + rpn_head=None, + roi_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(TwoStageDetector, self).__init__(init_cfg) + if pretrained: + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + backbone.pretrained = pretrained + self.backbone = build_backbone(backbone) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if neck is not None: + self.neck = build_neck(neck) + + if rpn_head is not None: + rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None + rpn_head_ = rpn_head.copy() + rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) + self.rpn_head = build_head(rpn_head_) + + if roi_head is not None: + # update train and test cfg here for now + # TODO: refactor assigner & sampler + rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None + roi_head.update(train_cfg=rcnn_train_cfg) + roi_head.update(test_cfg=test_cfg.rcnn) + roi_head.pretrained = pretrained + self.roi_head = build_head(roi_head) diff --git a/mmdet3d/models/detectors/votenet.py b/mmdet3d/models/detectors/votenet.py index 41e4144..fa6ba6c 100644 --- a/mmdet3d/models/detectors/votenet.py +++ b/mmdet3d/models/detectors/votenet.py @@ -1,107 +1,107 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d -from ..builder import DETECTORS -from .single_stage import SingleStage3DDetector - - -@DETECTORS.register_module() -class VoteNet(SingleStage3DDetector): - r"""`VoteNet `_ for 3D detection.""" - - def __init__(self, - backbone, - bbox_head=None, - train_cfg=None, - test_cfg=None, - init_cfg=None, - pretrained=None): - super(VoteNet, self).__init__( - backbone=backbone, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - init_cfg=None, - pretrained=pretrained) - - def forward_train(self, - points, - img_metas, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - gt_bboxes_ignore=None): - """Forward of training. - - Args: - points (list[torch.Tensor]): Points of each batch. - img_metas (list): Image metas. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. - pts_semantic_mask (list[torch.Tensor]): point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): point-wise instance - label of each batch. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - - Returns: - dict: Losses. - """ - points_cat = torch.stack(points) - - x = self.extract_feat(points_cat) - bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod) - loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, - pts_instance_mask, img_metas) - losses = self.bbox_head.loss( - bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - return losses - - def simple_test(self, points, img_metas, imgs=None, rescale=False): - """Forward of testing. - - Args: - points (list[torch.Tensor]): Points of each sample. - img_metas (list): Image metas. - rescale (bool): Whether to rescale results. - - Returns: - list: Predicted 3d boxes. 
- """ - points_cat = torch.stack(points) - - x = self.extract_feat(points_cat) - bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) - bbox_list = self.bbox_head.get_bboxes( - points_cat, bbox_preds, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test with augmentation.""" - points_cat = [torch.stack(pts) for pts in points] - feats = self.extract_feats(points_cat, img_metas) - - # only support aug_test for one sample - aug_bboxes = [] - for x, pts_cat, img_meta in zip(feats, points_cat, img_metas): - bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) - bbox_list = self.bbox_head.get_bboxes( - pts_cat, bbox_preds, img_meta, rescale=rescale) - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - # after merging, bboxes will be rescaled to the original image size - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, - self.bbox_head.test_cfg) - - return [merged_bboxes] +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d +from ..builder import DETECTORS +from .single_stage import SingleStage3DDetector + + +@DETECTORS.register_module() +class VoteNet(SingleStage3DDetector): + r"""`VoteNet `_ for 3D detection.""" + + def __init__(self, + backbone, + bbox_head=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + pretrained=None): + super(VoteNet, self).__init__( + backbone=backbone, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=None, + pretrained=pretrained) + + def forward_train(self, + points, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + gt_bboxes_ignore=None): + """Forward of training. + + Args: + points (list[torch.Tensor]): Points of each batch. + img_metas (list): Image metas. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. + pts_semantic_mask (list[torch.Tensor]): point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): point-wise instance + label of each batch. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + + Returns: + dict: Losses. + """ + points_cat = torch.stack(points) + + x = self.extract_feat(points_cat) + bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod) + loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, + pts_instance_mask, img_metas) + losses = self.bbox_head.loss( + bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def simple_test(self, points, img_metas, imgs=None, rescale=False): + """Forward of testing. + + Args: + points (list[torch.Tensor]): Points of each sample. + img_metas (list): Image metas. + rescale (bool): Whether to rescale results. + + Returns: + list: Predicted 3d boxes. 
+ """ + points_cat = torch.stack(points) + + x = self.extract_feat(points_cat) + bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) + bbox_list = self.bbox_head.get_bboxes( + points_cat, bbox_preds, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test with augmentation.""" + points_cat = [torch.stack(pts) for pts in points] + feats = self.extract_feats(points_cat, img_metas) + + # only support aug_test for one sample + aug_bboxes = [] + for x, pts_cat, img_meta in zip(feats, points_cat, img_metas): + bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) + bbox_list = self.bbox_head.get_bboxes( + pts_cat, bbox_preds, img_meta, rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) + for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, + self.bbox_head.test_cfg) + + return [merged_bboxes] diff --git a/mmdet3d/models/detectors/voxelnet.py b/mmdet3d/models/detectors/voxelnet.py index 9276b7d..8e4d75b 100644 --- a/mmdet3d/models/detectors/voxelnet.py +++ b/mmdet3d/models/detectors/voxelnet.py @@ -1,130 +1,130 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.ops import Voxelization -from mmcv.runner import force_fp32 -from torch.nn import functional as F - -from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d -from .. import builder -from ..builder import DETECTORS -from .single_stage import SingleStage3DDetector - - -@DETECTORS.register_module() -class VoxelNet(SingleStage3DDetector): - r"""`VoxelNet `_ for 3D detection.""" - - def __init__(self, - voxel_layer, - voxel_encoder, - middle_encoder, - backbone, - neck=None, - bbox_head=None, - train_cfg=None, - test_cfg=None, - init_cfg=None, - pretrained=None): - super(VoxelNet, self).__init__( - backbone=backbone, - neck=neck, - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - init_cfg=init_cfg, - pretrained=pretrained) - self.voxel_layer = Voxelization(**voxel_layer) - self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) - self.middle_encoder = builder.build_middle_encoder(middle_encoder) - - def extract_feat(self, points, img_metas=None): - """Extract features from points.""" - voxels, num_points, coors = self.voxelize(points) - voxel_features = self.voxel_encoder(voxels, num_points, coors) - batch_size = coors[-1, 0].item() + 1 - x = self.middle_encoder(voxel_features, coors, batch_size) - x = self.backbone(x) - if self.with_neck: - x = self.neck(x) - return x - - @torch.no_grad() - @force_fp32() - def voxelize(self, points): - """Apply hard voxelization to points.""" - voxels, coors, num_points = [], [], [] - for res in points: - res_voxels, res_coors, res_num_points = self.voxel_layer(res) - voxels.append(res_voxels) - coors.append(res_coors) - num_points.append(res_num_points) - voxels = torch.cat(voxels, dim=0) - num_points = torch.cat(num_points, dim=0) - coors_batch = [] - for i, coor in enumerate(coors): - coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) - coors_batch.append(coor_pad) - coors_batch = torch.cat(coors_batch, dim=0) - return voxels, num_points, coors_batch - - def forward_train(self, - points, - img_metas, - gt_bboxes_3d, - gt_labels_3d, - gt_bboxes_ignore=None): - 
"""Training forward function. - - Args: - points (list[torch.Tensor]): Point cloud of each sample. - img_metas (list[dict]): Meta information of each sample - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. - gt_labels_3d (list[torch.Tensor]): Ground truth labels for - boxes of each sampole - gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth - boxes to be ignored. Defaults to None. - - Returns: - dict: Losses of each branch. - """ - x = self.extract_feat(points, img_metas) - outs = self.bbox_head(x) - loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) - losses = self.bbox_head.loss( - *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - return losses - - def simple_test(self, points, img_metas, imgs=None, rescale=False): - """Test function without augmentaiton.""" - x = self.extract_feat(points, img_metas) - outs = self.bbox_head(x) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_metas, rescale=rescale) - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def aug_test(self, points, img_metas, imgs=None, rescale=False): - """Test function with augmentaiton.""" - feats = self.extract_feats(points, img_metas) - - # only support aug_test for one sample - aug_bboxes = [] - for x, img_meta in zip(feats, img_metas): - outs = self.bbox_head(x) - bbox_list = self.bbox_head.get_bboxes( - *outs, img_meta, rescale=rescale) - bbox_list = [ - dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) - for bboxes, scores, labels in bbox_list - ] - aug_bboxes.append(bbox_list[0]) - - # after merging, bboxes will be rescaled to the original image size - merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, - self.bbox_head.test_cfg) - - return [merged_bboxes] +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.ops import Voxelization +from mmcv.runner import force_fp32 +from torch.nn import functional as F + +from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d +from .. 
import builder +from ..builder import DETECTORS +from .single_stage import SingleStage3DDetector + + +@DETECTORS.register_module() +class VoxelNet(SingleStage3DDetector): + r"""`VoxelNet `_ for 3D detection.""" + + def __init__(self, + voxel_layer, + voxel_encoder, + middle_encoder, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + pretrained=None): + super(VoxelNet, self).__init__( + backbone=backbone, + neck=neck, + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg, + pretrained=pretrained) + self.voxel_layer = Voxelization(**voxel_layer) + self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) + self.middle_encoder = builder.build_middle_encoder(middle_encoder) + + def extract_feat(self, points, img_metas=None): + """Extract features from points.""" + voxels, num_points, coors = self.voxelize(points) + voxel_features = self.voxel_encoder(voxels, num_points, coors) + batch_size = coors[-1, 0].item() + 1 + x = self.middle_encoder(voxel_features, coors, batch_size) + x = self.backbone(x) + if self.with_neck: + x = self.neck(x) + return x + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + """Apply hard voxelization to points.""" + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + gt_bboxes_ignore=None): + """Training forward function. + + Args: + points (list[torch.Tensor]): Point cloud of each sample. + img_metas (list[dict]): Meta information of each sample + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + + Returns: + dict: Losses of each branch. 
+ """ + x = self.extract_feat(points, img_metas) + outs = self.bbox_head(x) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) + losses = self.bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def simple_test(self, points, img_metas, imgs=None, rescale=False): + """Test function without augmentaiton.""" + x = self.extract_feat(points, img_metas) + outs = self.bbox_head(x) + bbox_list = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test function with augmentaiton.""" + feats = self.extract_feats(points, img_metas) + + # only support aug_test for one sample + aug_bboxes = [] + for x, img_meta in zip(feats, img_metas): + outs = self.bbox_head(x) + bbox_list = self.bbox_head.get_bboxes( + *outs, img_meta, rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) + for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, + self.bbox_head.test_cfg) + + return [merged_bboxes] diff --git a/mmdet3d/models/fusion_layers/__init__.py b/mmdet3d/models/fusion_layers/__init__.py index 6df4741..7748c04 100644 --- a/mmdet3d/models/fusion_layers/__init__.py +++ b/mmdet3d/models/fusion_layers/__init__.py @@ -1,10 +1,10 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .coord_transform import (apply_3d_transformation, bbox_2d_transform, - coord_2d_transform) -from .point_fusion import PointFusion -from .vote_fusion import VoteFusion - -__all__ = [ - 'PointFusion', 'VoteFusion', 'apply_3d_transformation', - 'bbox_2d_transform', 'coord_2d_transform' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .coord_transform import (apply_3d_transformation, bbox_2d_transform, + coord_2d_transform) +from .point_fusion import PointFusion +from .vote_fusion import VoteFusion + +__all__ = [ + 'PointFusion', 'VoteFusion', 'apply_3d_transformation', + 'bbox_2d_transform', 'coord_2d_transform' +] diff --git a/mmdet3d/models/fusion_layers/coord_transform.py b/mmdet3d/models/fusion_layers/coord_transform.py index 7cdcac9..f1b2151 100644 --- a/mmdet3d/models/fusion_layers/coord_transform.py +++ b/mmdet3d/models/fusion_layers/coord_transform.py @@ -1,216 +1,216 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from functools import partial - -import torch - -from mmdet3d.core.points import get_points_type - - -def apply_3d_transformation(pcd, coord_type, img_meta, reverse=False): - """Apply transformation to input point cloud. - - Args: - pcd (torch.Tensor): The point cloud to be transformed. - coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. - img_meta(dict): Meta info regarding data transformation. - reverse (bool): Reversed transformation or not. - - Note: - The elements in img_meta['transformation_3d_flow']: - "T" stands for translation; - "S" stands for scale; - "R" stands for rotation; - "HF" stands for horizontal flip; - "VF" stands for vertical flip. - - Returns: - torch.Tensor: The transformed point cloud. 
- """ - - dtype = pcd.dtype - device = pcd.device - - pcd_rotate_mat = ( - torch.tensor(img_meta['pcd_rotation'], dtype=dtype, device=device) - if 'pcd_rotation' in img_meta else torch.eye( - 3, dtype=dtype, device=device)) - - pcd_scale_factor = ( - img_meta['pcd_scale_factor'] if 'pcd_scale_factor' in img_meta else 1.) - - pcd_trans_factor = ( - torch.tensor(img_meta['pcd_trans'], dtype=dtype, device=device) - if 'pcd_trans' in img_meta else torch.zeros( - (3), dtype=dtype, device=device)) - - pcd_horizontal_flip = img_meta[ - 'pcd_horizontal_flip'] if 'pcd_horizontal_flip' in \ - img_meta else False - - pcd_vertical_flip = img_meta[ - 'pcd_vertical_flip'] if 'pcd_vertical_flip' in \ - img_meta else False - - flow = img_meta['transformation_3d_flow'] \ - if 'transformation_3d_flow' in img_meta else [] - - pcd = pcd.clone() # prevent inplace modification - pcd = get_points_type(coord_type)(pcd) - - horizontal_flip_func = partial(pcd.flip, bev_direction='horizontal') \ - if pcd_horizontal_flip else lambda: None - vertical_flip_func = partial(pcd.flip, bev_direction='vertical') \ - if pcd_vertical_flip else lambda: None - if reverse: - scale_func = partial(pcd.scale, scale_factor=1.0 / pcd_scale_factor) - translate_func = partial(pcd.translate, trans_vector=-pcd_trans_factor) - # pcd_rotate_mat @ pcd_rotate_mat.inverse() is not - # exactly an identity matrix - # use angle to create the inverse rot matrix neither. - rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat.inverse()) - - # reverse the pipeline - flow = flow[::-1] - else: - scale_func = partial(pcd.scale, scale_factor=pcd_scale_factor) - translate_func = partial(pcd.translate, trans_vector=pcd_trans_factor) - rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat) - - flow_mapping = { - 'T': translate_func, - 'S': scale_func, - 'R': rotate_func, - 'HF': horizontal_flip_func, - 'VF': vertical_flip_func - } - for op in flow: - assert op in flow_mapping, f'This 3D data '\ - f'transformation op ({op}) is not supported' - func = flow_mapping[op] - func() - - return pcd.coord - - -def extract_2d_info(img_meta, tensor): - """Extract image augmentation information from img_meta. - - Args: - img_meta(dict): Meta info regarding data transformation. - tensor(torch.Tensor): Input tensor used to create new ones. - - Returns: - (int, int, int, int, torch.Tensor, bool, torch.Tensor): - The extracted information. - """ - img_shape = img_meta['img_shape'] - ori_shape = img_meta['ori_shape'] - img_h, img_w, _ = img_shape - ori_h, ori_w, _ = ori_shape - - img_scale_factor = ( - tensor.new_tensor(img_meta['scale_factor'][:2]) - if 'scale_factor' in img_meta else tensor.new_tensor([1.0, 1.0])) - img_flip = img_meta['flip'] if 'flip' in img_meta else False - img_crop_offset = ( - tensor.new_tensor(img_meta['img_crop_offset']) - if 'img_crop_offset' in img_meta else tensor.new_tensor([0.0, 0.0])) - - return (img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, - img_crop_offset) - - -def bbox_2d_transform(img_meta, bbox_2d, ori2new): - """Transform 2d bbox according to img_meta. - - Args: - img_meta(dict): Meta info regarding data transformation. - bbox_2d (torch.Tensor): Shape (..., >4) - The input 2d bboxes to transform. - ori2new (bool): Origin img coord system to new or not. - - Returns: - torch.Tensor: The transformed 2d bboxes. 
- """ - - img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \ - img_crop_offset = extract_2d_info(img_meta, bbox_2d) - - bbox_2d_new = bbox_2d.clone() - - if ori2new: - bbox_2d_new[:, 0] = bbox_2d_new[:, 0] * img_scale_factor[0] - bbox_2d_new[:, 2] = bbox_2d_new[:, 2] * img_scale_factor[0] - bbox_2d_new[:, 1] = bbox_2d_new[:, 1] * img_scale_factor[1] - bbox_2d_new[:, 3] = bbox_2d_new[:, 3] * img_scale_factor[1] - - bbox_2d_new[:, 0] = bbox_2d_new[:, 0] + img_crop_offset[0] - bbox_2d_new[:, 2] = bbox_2d_new[:, 2] + img_crop_offset[0] - bbox_2d_new[:, 1] = bbox_2d_new[:, 1] + img_crop_offset[1] - bbox_2d_new[:, 3] = bbox_2d_new[:, 3] + img_crop_offset[1] - - if img_flip: - bbox_2d_r = img_w - bbox_2d_new[:, 0] - bbox_2d_l = img_w - bbox_2d_new[:, 2] - bbox_2d_new[:, 0] = bbox_2d_l - bbox_2d_new[:, 2] = bbox_2d_r - else: - if img_flip: - bbox_2d_r = img_w - bbox_2d_new[:, 0] - bbox_2d_l = img_w - bbox_2d_new[:, 2] - bbox_2d_new[:, 0] = bbox_2d_l - bbox_2d_new[:, 2] = bbox_2d_r - - bbox_2d_new[:, 0] = bbox_2d_new[:, 0] - img_crop_offset[0] - bbox_2d_new[:, 2] = bbox_2d_new[:, 2] - img_crop_offset[0] - bbox_2d_new[:, 1] = bbox_2d_new[:, 1] - img_crop_offset[1] - bbox_2d_new[:, 3] = bbox_2d_new[:, 3] - img_crop_offset[1] - - bbox_2d_new[:, 0] = bbox_2d_new[:, 0] / img_scale_factor[0] - bbox_2d_new[:, 2] = bbox_2d_new[:, 2] / img_scale_factor[0] - bbox_2d_new[:, 1] = bbox_2d_new[:, 1] / img_scale_factor[1] - bbox_2d_new[:, 3] = bbox_2d_new[:, 3] / img_scale_factor[1] - - return bbox_2d_new - - -def coord_2d_transform(img_meta, coord_2d, ori2new): - """Transform 2d pixel coordinates according to img_meta. - - Args: - img_meta(dict): Meta info regarding data transformation. - coord_2d (torch.Tensor): Shape (..., 2) - The input 2d coords to transform. - ori2new (bool): Origin img coord system to new or not. - - Returns: - torch.Tensor: The transformed 2d coordinates. - """ - - img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \ - img_crop_offset = extract_2d_info(img_meta, coord_2d) - - coord_2d_new = coord_2d.clone() - - if ori2new: - # TODO here we assume this order of transformation - coord_2d_new[..., 0] = coord_2d_new[..., 0] * img_scale_factor[0] - coord_2d_new[..., 1] = coord_2d_new[..., 1] * img_scale_factor[1] - - coord_2d_new[..., 0] += img_crop_offset[0] - coord_2d_new[..., 1] += img_crop_offset[1] - - # flip uv coordinates and bbox - if img_flip: - coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0] - else: - if img_flip: - coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0] - - coord_2d_new[..., 0] -= img_crop_offset[0] - coord_2d_new[..., 1] -= img_crop_offset[1] - - coord_2d_new[..., 0] = coord_2d_new[..., 0] / img_scale_factor[0] - coord_2d_new[..., 1] = coord_2d_new[..., 1] / img_scale_factor[1] - - return coord_2d_new +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial + +import torch + +from mmdet3d.core.points import get_points_type + + +def apply_3d_transformation(pcd, coord_type, img_meta, reverse=False): + """Apply transformation to input point cloud. + + Args: + pcd (torch.Tensor): The point cloud to be transformed. + coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. + img_meta(dict): Meta info regarding data transformation. + reverse (bool): Reversed transformation or not. + + Note: + The elements in img_meta['transformation_3d_flow']: + "T" stands for translation; + "S" stands for scale; + "R" stands for rotation; + "HF" stands for horizontal flip; + "VF" stands for vertical flip. 
+ + Returns: + torch.Tensor: The transformed point cloud. + """ + + dtype = pcd.dtype + device = pcd.device + + pcd_rotate_mat = ( + torch.tensor(img_meta['pcd_rotation'], dtype=dtype, device=device) + if 'pcd_rotation' in img_meta else torch.eye( + 3, dtype=dtype, device=device)) + + pcd_scale_factor = ( + img_meta['pcd_scale_factor'] if 'pcd_scale_factor' in img_meta else 1.) + + pcd_trans_factor = ( + torch.tensor(img_meta['pcd_trans'], dtype=dtype, device=device) + if 'pcd_trans' in img_meta else torch.zeros( + (3), dtype=dtype, device=device)) + + pcd_horizontal_flip = img_meta[ + 'pcd_horizontal_flip'] if 'pcd_horizontal_flip' in \ + img_meta else False + + pcd_vertical_flip = img_meta[ + 'pcd_vertical_flip'] if 'pcd_vertical_flip' in \ + img_meta else False + + flow = img_meta['transformation_3d_flow'] \ + if 'transformation_3d_flow' in img_meta else [] + + pcd = pcd.clone() # prevent inplace modification + pcd = get_points_type(coord_type)(pcd) + + horizontal_flip_func = partial(pcd.flip, bev_direction='horizontal') \ + if pcd_horizontal_flip else lambda: None + vertical_flip_func = partial(pcd.flip, bev_direction='vertical') \ + if pcd_vertical_flip else lambda: None + if reverse: + scale_func = partial(pcd.scale, scale_factor=1.0 / pcd_scale_factor) + translate_func = partial(pcd.translate, trans_vector=-pcd_trans_factor) + # pcd_rotate_mat @ pcd_rotate_mat.inverse() is not + # exactly an identity matrix + # use angle to create the inverse rot matrix neither. + rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat.inverse()) + + # reverse the pipeline + flow = flow[::-1] + else: + scale_func = partial(pcd.scale, scale_factor=pcd_scale_factor) + translate_func = partial(pcd.translate, trans_vector=pcd_trans_factor) + rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat) + + flow_mapping = { + 'T': translate_func, + 'S': scale_func, + 'R': rotate_func, + 'HF': horizontal_flip_func, + 'VF': vertical_flip_func + } + for op in flow: + assert op in flow_mapping, f'This 3D data '\ + f'transformation op ({op}) is not supported' + func = flow_mapping[op] + func() + + return pcd.coord + + +def extract_2d_info(img_meta, tensor): + """Extract image augmentation information from img_meta. + + Args: + img_meta(dict): Meta info regarding data transformation. + tensor(torch.Tensor): Input tensor used to create new ones. + + Returns: + (int, int, int, int, torch.Tensor, bool, torch.Tensor): + The extracted information. + """ + img_shape = img_meta['img_shape'] + ori_shape = img_meta['ori_shape'] + img_h, img_w, _ = img_shape + ori_h, ori_w, _ = ori_shape + + img_scale_factor = ( + tensor.new_tensor(img_meta['scale_factor'][:2]) + if 'scale_factor' in img_meta else tensor.new_tensor([1.0, 1.0])) + img_flip = img_meta['flip'] if 'flip' in img_meta else False + img_crop_offset = ( + tensor.new_tensor(img_meta['img_crop_offset']) + if 'img_crop_offset' in img_meta else tensor.new_tensor([0.0, 0.0])) + + return (img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, + img_crop_offset) + + +def bbox_2d_transform(img_meta, bbox_2d, ori2new): + """Transform 2d bbox according to img_meta. + + Args: + img_meta(dict): Meta info regarding data transformation. + bbox_2d (torch.Tensor): Shape (..., >4) + The input 2d bboxes to transform. + ori2new (bool): Origin img coord system to new or not. + + Returns: + torch.Tensor: The transformed 2d bboxes. 
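The 2D helpers (extract_2d_info, bbox_2d_transform and coord_2d_transform) share one convention: ori2new=True replays scale -> crop -> flip in the original image frame, while ori2new=False undoes the same steps in reverse order, so a round trip is lossless. A small sketch with a hypothetical img_meta, assuming bbox_2d_transform is re-exported by the fusion_layers package as the relative imports in vote_fusion.py below indicate:

import torch

from mmdet3d.models.fusion_layers import bbox_2d_transform  # assumed re-export

# Hypothetical augmentation record: a 1280x960 image resized by 0.5 and
# horizontally flipped; no crop was applied.
img_meta = dict(
    ori_shape=(960, 1280, 3),
    img_shape=(480, 640, 3),
    scale_factor=[0.5, 0.5, 0.5, 0.5],  # (w_scale, h_scale, w_scale, h_scale)
    flip=True,
    img_crop_offset=[0.0, 0.0])

# Boxes in the original image frame: (x1, y1, x2, y2, score, label).
bboxes_ori = torch.tensor([[100., 200., 300., 400., 0.9, 1.]])

bboxes_new = bbox_2d_transform(img_meta, bboxes_ori, ori2new=True)
bboxes_back = bbox_2d_transform(img_meta, bboxes_new, ori2new=False)
assert torch.allclose(bboxes_ori[:, :4], bboxes_back[:, :4])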
+ """ + + img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \ + img_crop_offset = extract_2d_info(img_meta, bbox_2d) + + bbox_2d_new = bbox_2d.clone() + + if ori2new: + bbox_2d_new[:, 0] = bbox_2d_new[:, 0] * img_scale_factor[0] + bbox_2d_new[:, 2] = bbox_2d_new[:, 2] * img_scale_factor[0] + bbox_2d_new[:, 1] = bbox_2d_new[:, 1] * img_scale_factor[1] + bbox_2d_new[:, 3] = bbox_2d_new[:, 3] * img_scale_factor[1] + + bbox_2d_new[:, 0] = bbox_2d_new[:, 0] + img_crop_offset[0] + bbox_2d_new[:, 2] = bbox_2d_new[:, 2] + img_crop_offset[0] + bbox_2d_new[:, 1] = bbox_2d_new[:, 1] + img_crop_offset[1] + bbox_2d_new[:, 3] = bbox_2d_new[:, 3] + img_crop_offset[1] + + if img_flip: + bbox_2d_r = img_w - bbox_2d_new[:, 0] + bbox_2d_l = img_w - bbox_2d_new[:, 2] + bbox_2d_new[:, 0] = bbox_2d_l + bbox_2d_new[:, 2] = bbox_2d_r + else: + if img_flip: + bbox_2d_r = img_w - bbox_2d_new[:, 0] + bbox_2d_l = img_w - bbox_2d_new[:, 2] + bbox_2d_new[:, 0] = bbox_2d_l + bbox_2d_new[:, 2] = bbox_2d_r + + bbox_2d_new[:, 0] = bbox_2d_new[:, 0] - img_crop_offset[0] + bbox_2d_new[:, 2] = bbox_2d_new[:, 2] - img_crop_offset[0] + bbox_2d_new[:, 1] = bbox_2d_new[:, 1] - img_crop_offset[1] + bbox_2d_new[:, 3] = bbox_2d_new[:, 3] - img_crop_offset[1] + + bbox_2d_new[:, 0] = bbox_2d_new[:, 0] / img_scale_factor[0] + bbox_2d_new[:, 2] = bbox_2d_new[:, 2] / img_scale_factor[0] + bbox_2d_new[:, 1] = bbox_2d_new[:, 1] / img_scale_factor[1] + bbox_2d_new[:, 3] = bbox_2d_new[:, 3] / img_scale_factor[1] + + return bbox_2d_new + + +def coord_2d_transform(img_meta, coord_2d, ori2new): + """Transform 2d pixel coordinates according to img_meta. + + Args: + img_meta(dict): Meta info regarding data transformation. + coord_2d (torch.Tensor): Shape (..., 2) + The input 2d coords to transform. + ori2new (bool): Origin img coord system to new or not. + + Returns: + torch.Tensor: The transformed 2d coordinates. + """ + + img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \ + img_crop_offset = extract_2d_info(img_meta, coord_2d) + + coord_2d_new = coord_2d.clone() + + if ori2new: + # TODO here we assume this order of transformation + coord_2d_new[..., 0] = coord_2d_new[..., 0] * img_scale_factor[0] + coord_2d_new[..., 1] = coord_2d_new[..., 1] * img_scale_factor[1] + + coord_2d_new[..., 0] += img_crop_offset[0] + coord_2d_new[..., 1] += img_crop_offset[1] + + # flip uv coordinates and bbox + if img_flip: + coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0] + else: + if img_flip: + coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0] + + coord_2d_new[..., 0] -= img_crop_offset[0] + coord_2d_new[..., 1] -= img_crop_offset[1] + + coord_2d_new[..., 0] = coord_2d_new[..., 0] / img_scale_factor[0] + coord_2d_new[..., 1] = coord_2d_new[..., 1] / img_scale_factor[1] + + return coord_2d_new diff --git a/mmdet3d/models/fusion_layers/point_fusion.py b/mmdet3d/models/fusion_layers/point_fusion.py index 97b4177..6441366 100644 --- a/mmdet3d/models/fusion_layers/point_fusion.py +++ b/mmdet3d/models/fusion_layers/point_fusion.py @@ -1,306 +1,306 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule -from torch import nn as nn -from torch.nn import functional as F - -from mmdet3d.core.bbox.structures import (get_proj_mat_by_coord_type, - points_cam2img) -from ..builder import FUSION_LAYERS -from . 
import apply_3d_transformation - - -def point_sample(img_meta, - img_features, - points, - proj_mat, - coord_type, - img_scale_factor, - img_crop_offset, - img_flip, - img_pad_shape, - img_shape, - aligned=True, - padding_mode='zeros', - align_corners=True): - """Obtain image features using points. - - Args: - img_meta (dict): Meta info. - img_features (torch.Tensor): 1 x C x H x W image features. - points (torch.Tensor): Nx3 point cloud in LiDAR coordinates. - proj_mat (torch.Tensor): 4x4 transformation matrix. - coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. - img_scale_factor (torch.Tensor): Scale factor with shape of - (w_scale, h_scale). - img_crop_offset (torch.Tensor): Crop offset used to crop - image during data augmentation with shape of (w_offset, h_offset). - img_flip (bool): Whether the image is flipped. - img_pad_shape (tuple[int]): int tuple indicates the h & w after - padding, this is necessary to obtain features in feature map. - img_shape (tuple[int]): int tuple indicates the h & w before padding - after scaling, this is necessary for flipping coordinates. - aligned (bool, optional): Whether use bilinear interpolation when - sampling image features for each point. Defaults to True. - padding_mode (str, optional): Padding mode when padding values for - features of out-of-image points. Defaults to 'zeros'. - align_corners (bool, optional): Whether to align corners when - sampling image features for each point. Defaults to True. - - Returns: - torch.Tensor: NxC image features sampled by point coordinates. - """ - - # apply transformation based on info in img_meta - points = apply_3d_transformation( - points, coord_type, img_meta, reverse=True) - - # project points to camera coordinate - pts_2d = points_cam2img(points, proj_mat) - - # img transformation: scale -> crop -> flip - # the image is resized by img_scale_factor - img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2 - img_coors -= img_crop_offset - - # grid sample, the valid grid range should be in [-1,1] - coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1 - - if img_flip: - # by default we take it as horizontal flip - # use img_shape before padding for flip - orig_h, orig_w = img_shape - coor_x = orig_w - coor_x - - h, w = img_pad_shape - coor_y = coor_y / h * 2 - 1 - coor_x = coor_x / w * 2 - 1 - grid = torch.cat([coor_x, coor_y], - dim=1).unsqueeze(0).unsqueeze(0) # Nx2 -> 1x1xNx2 - - # align_corner=True provides higher performance - mode = 'bilinear' if aligned else 'nearest' - point_features = F.grid_sample( - img_features, - grid, - mode=mode, - padding_mode=padding_mode, - align_corners=align_corners) # 1xCx1xN feats - - return point_features.squeeze().t() - - -@FUSION_LAYERS.register_module() -class PointFusion(BaseModule): - """Fuse image features from multi-scale features. - - Args: - img_channels (list[int] | int): Channels of image features. - It could be a list if the input is multi-scale image features. - pts_channels (int): Channels of point features - mid_channels (int): Channels of middle layers - out_channels (int): Channels of output fused features - img_levels (int, optional): Number of image levels. Defaults to 3. - coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. - Defaults to 'LIDAR'. - conv_cfg (dict, optional): Dict config of conv layers of middle - layers. Defaults to None. - norm_cfg (dict, optional): Dict config of norm layers of middle - layers. Defaults to None. - act_cfg (dict, optional): Dict config of activatation layers. - Defaults to None. 
- activate_out (bool, optional): Whether to apply relu activation - to output features. Defaults to True. - fuse_out (bool, optional): Whether apply conv layer to the fused - features. Defaults to False. - dropout_ratio (int, float, optional): Dropout ratio of image - features to prevent overfitting. Defaults to 0. - aligned (bool, optional): Whether apply aligned feature fusion. - Defaults to True. - align_corners (bool, optional): Whether to align corner when - sampling features according to points. Defaults to True. - padding_mode (str, optional): Mode used to pad the features of - points that do not have corresponding image features. - Defaults to 'zeros'. - lateral_conv (bool, optional): Whether to apply lateral convs - to image features. Defaults to True. - """ - - def __init__(self, - img_channels, - pts_channels, - mid_channels, - out_channels, - img_levels=3, - coord_type='LIDAR', - conv_cfg=None, - norm_cfg=None, - act_cfg=None, - init_cfg=None, - activate_out=True, - fuse_out=False, - dropout_ratio=0, - aligned=True, - align_corners=True, - padding_mode='zeros', - lateral_conv=True): - super(PointFusion, self).__init__(init_cfg=init_cfg) - if isinstance(img_levels, int): - img_levels = [img_levels] - if isinstance(img_channels, int): - img_channels = [img_channels] * len(img_levels) - assert isinstance(img_levels, list) - assert isinstance(img_channels, list) - assert len(img_channels) == len(img_levels) - - self.img_levels = img_levels - self.coord_type = coord_type - self.act_cfg = act_cfg - self.activate_out = activate_out - self.fuse_out = fuse_out - self.dropout_ratio = dropout_ratio - self.img_channels = img_channels - self.aligned = aligned - self.align_corners = align_corners - self.padding_mode = padding_mode - - self.lateral_convs = None - if lateral_conv: - self.lateral_convs = nn.ModuleList() - for i in range(len(img_channels)): - l_conv = ConvModule( - img_channels[i], - mid_channels, - 3, - padding=1, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=self.act_cfg, - inplace=False) - self.lateral_convs.append(l_conv) - self.img_transform = nn.Sequential( - nn.Linear(mid_channels * len(img_channels), out_channels), - nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), - ) - else: - self.img_transform = nn.Sequential( - nn.Linear(sum(img_channels), out_channels), - nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), - ) - self.pts_transform = nn.Sequential( - nn.Linear(pts_channels, out_channels), - nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), - ) - - if self.fuse_out: - self.fuse_conv = nn.Sequential( - nn.Linear(mid_channels, out_channels), - # For pts the BN is initialized differently by default - # TODO: check whether this is necessary - nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), - nn.ReLU(inplace=False)) - - if init_cfg is None: - self.init_cfg = [ - dict(type='Xavier', layer='Conv2d', distribution='uniform'), - dict(type='Xavier', layer='Linear', distribution='uniform') - ] - - def forward(self, img_feats, pts, pts_feats, img_metas): - """Forward function. - - Args: - img_feats (list[torch.Tensor]): Image features. - pts: [list[torch.Tensor]]: A batch of points with shape N x 3. - pts_feats (torch.Tensor): A tensor consist of point features of the - total batch. - img_metas (list[dict]): Meta information of images. - - Returns: - torch.Tensor: Fused features of each point. 
- """ - img_pts = self.obtain_mlvl_feats(img_feats, pts, img_metas) - img_pre_fuse = self.img_transform(img_pts) - if self.training and self.dropout_ratio > 0: - img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio) - pts_pre_fuse = self.pts_transform(pts_feats) - - fuse_out = img_pre_fuse + pts_pre_fuse - if self.activate_out: - fuse_out = F.relu(fuse_out) - if self.fuse_out: - fuse_out = self.fuse_conv(fuse_out) - - return fuse_out - - def obtain_mlvl_feats(self, img_feats, pts, img_metas): - """Obtain multi-level features for each point. - - Args: - img_feats (list(torch.Tensor)): Multi-scale image features produced - by image backbone in shape (N, C, H, W). - pts (list[torch.Tensor]): Points of each sample. - img_metas (list[dict]): Meta information for each sample. - - Returns: - torch.Tensor: Corresponding image features of each point. - """ - if self.lateral_convs is not None: - img_ins = [ - lateral_conv(img_feats[i]) - for i, lateral_conv in zip(self.img_levels, self.lateral_convs) - ] - else: - img_ins = img_feats - img_feats_per_point = [] - # Sample multi-level features - for i in range(len(img_metas)): - mlvl_img_feats = [] - for level in range(len(self.img_levels)): - mlvl_img_feats.append( - self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3], - img_metas[i])) - mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1) - img_feats_per_point.append(mlvl_img_feats) - - img_pts = torch.cat(img_feats_per_point, dim=0) - return img_pts - - def sample_single(self, img_feats, pts, img_meta): - """Sample features from single level image feature map. - - Args: - img_feats (torch.Tensor): Image feature map in shape - (1, C, H, W). - pts (torch.Tensor): Points of a single sample. - img_meta (dict): Meta information of the single sample. - - Returns: - torch.Tensor: Single level image features of each point. - """ - # TODO: image transformation also extracted - img_scale_factor = ( - pts.new_tensor(img_meta['scale_factor'][:2]) - if 'scale_factor' in img_meta.keys() else 1) - img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False - img_crop_offset = ( - pts.new_tensor(img_meta['img_crop_offset']) - if 'img_crop_offset' in img_meta.keys() else 0) - proj_mat = get_proj_mat_by_coord_type(img_meta, self.coord_type) - img_pts = point_sample( - img_meta=img_meta, - img_features=img_feats, - points=pts, - proj_mat=pts.new_tensor(proj_mat), - coord_type=self.coord_type, - img_scale_factor=img_scale_factor, - img_crop_offset=img_crop_offset, - img_flip=img_flip, - img_pad_shape=img_meta['input_shape'][:2], - img_shape=img_meta['img_shape'][:2], - aligned=self.aligned, - padding_mode=self.padding_mode, - align_corners=self.align_corners, - ) - return img_pts +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule +from torch import nn as nn +from torch.nn import functional as F + +from mmdet3d.core.bbox.structures import (get_proj_mat_by_coord_type, + points_cam2img) +from ..builder import FUSION_LAYERS +from . import apply_3d_transformation + + +def point_sample(img_meta, + img_features, + points, + proj_mat, + coord_type, + img_scale_factor, + img_crop_offset, + img_flip, + img_pad_shape, + img_shape, + aligned=True, + padding_mode='zeros', + align_corners=True): + """Obtain image features using points. + + Args: + img_meta (dict): Meta info. + img_features (torch.Tensor): 1 x C x H x W image features. + points (torch.Tensor): Nx3 point cloud in LiDAR coordinates. 
+ proj_mat (torch.Tensor): 4x4 transformation matrix. + coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. + img_scale_factor (torch.Tensor): Scale factor with shape of + (w_scale, h_scale). + img_crop_offset (torch.Tensor): Crop offset used to crop + image during data augmentation with shape of (w_offset, h_offset). + img_flip (bool): Whether the image is flipped. + img_pad_shape (tuple[int]): int tuple indicates the h & w after + padding, this is necessary to obtain features in feature map. + img_shape (tuple[int]): int tuple indicates the h & w before padding + after scaling, this is necessary for flipping coordinates. + aligned (bool, optional): Whether use bilinear interpolation when + sampling image features for each point. Defaults to True. + padding_mode (str, optional): Padding mode when padding values for + features of out-of-image points. Defaults to 'zeros'. + align_corners (bool, optional): Whether to align corners when + sampling image features for each point. Defaults to True. + + Returns: + torch.Tensor: NxC image features sampled by point coordinates. + """ + + # apply transformation based on info in img_meta + points = apply_3d_transformation( + points, coord_type, img_meta, reverse=True) + + # project points to camera coordinate + pts_2d = points_cam2img(points, proj_mat) + + # img transformation: scale -> crop -> flip + # the image is resized by img_scale_factor + img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2 + img_coors -= img_crop_offset + + # grid sample, the valid grid range should be in [-1,1] + coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1 + + if img_flip: + # by default we take it as horizontal flip + # use img_shape before padding for flip + orig_h, orig_w = img_shape + coor_x = orig_w - coor_x + + h, w = img_pad_shape + coor_y = coor_y / h * 2 - 1 + coor_x = coor_x / w * 2 - 1 + grid = torch.cat([coor_x, coor_y], + dim=1).unsqueeze(0).unsqueeze(0) # Nx2 -> 1x1xNx2 + + # align_corner=True provides higher performance + mode = 'bilinear' if aligned else 'nearest' + point_features = F.grid_sample( + img_features, + grid, + mode=mode, + padding_mode=padding_mode, + align_corners=align_corners) # 1xCx1xN feats + + return point_features.squeeze().t() + + +@FUSION_LAYERS.register_module() +class PointFusion(BaseModule): + """Fuse image features from multi-scale features. + + Args: + img_channels (list[int] | int): Channels of image features. + It could be a list if the input is multi-scale image features. + pts_channels (int): Channels of point features + mid_channels (int): Channels of middle layers + out_channels (int): Channels of output fused features + img_levels (int, optional): Number of image levels. Defaults to 3. + coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. + Defaults to 'LIDAR'. + conv_cfg (dict, optional): Dict config of conv layers of middle + layers. Defaults to None. + norm_cfg (dict, optional): Dict config of norm layers of middle + layers. Defaults to None. + act_cfg (dict, optional): Dict config of activatation layers. + Defaults to None. + activate_out (bool, optional): Whether to apply relu activation + to output features. Defaults to True. + fuse_out (bool, optional): Whether apply conv layer to the fused + features. Defaults to False. + dropout_ratio (int, float, optional): Dropout ratio of image + features to prevent overfitting. Defaults to 0. + aligned (bool, optional): Whether apply aligned feature fusion. + Defaults to True. 
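point_sample boils down to three steps: undo the 3D augmentation, project the points to pixel coordinates, then normalize those pixels into grid_sample's [-1, 1] range. The normalization is the part that is easy to get backwards, so here is a self-contained sketch of just that step on dummy data, mirroring the coor / size * 2 - 1 mapping used above; feature values and pixel locations are arbitrary.

import torch
import torch.nn.functional as F

# Dummy single-image feature map: 1 x C x H x W.
feats = torch.arange(2 * 4 * 6, dtype=torch.float32).reshape(1, 2, 4, 6)
h, w = 4, 6

# Two hypothetical projected points in (u, v) pixel coordinates of the padded image.
uv = torch.tensor([[1.0, 1.0],
                   [4.5, 2.0]])

# Same normalization as point_sample: pixels -> grid_sample's [-1, 1] range.
grid = torch.stack([uv[:, 0] / w * 2 - 1,
                    uv[:, 1] / h * 2 - 1], dim=1).view(1, 1, -1, 2)

sampled = F.grid_sample(feats, grid, mode='bilinear', align_corners=True)
print(sampled.squeeze().t().shape)  # torch.Size([2, 2]): one C-dim feature row per point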
+ align_corners (bool, optional): Whether to align corner when + sampling features according to points. Defaults to True. + padding_mode (str, optional): Mode used to pad the features of + points that do not have corresponding image features. + Defaults to 'zeros'. + lateral_conv (bool, optional): Whether to apply lateral convs + to image features. Defaults to True. + """ + + def __init__(self, + img_channels, + pts_channels, + mid_channels, + out_channels, + img_levels=3, + coord_type='LIDAR', + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + init_cfg=None, + activate_out=True, + fuse_out=False, + dropout_ratio=0, + aligned=True, + align_corners=True, + padding_mode='zeros', + lateral_conv=True): + super(PointFusion, self).__init__(init_cfg=init_cfg) + if isinstance(img_levels, int): + img_levels = [img_levels] + if isinstance(img_channels, int): + img_channels = [img_channels] * len(img_levels) + assert isinstance(img_levels, list) + assert isinstance(img_channels, list) + assert len(img_channels) == len(img_levels) + + self.img_levels = img_levels + self.coord_type = coord_type + self.act_cfg = act_cfg + self.activate_out = activate_out + self.fuse_out = fuse_out + self.dropout_ratio = dropout_ratio + self.img_channels = img_channels + self.aligned = aligned + self.align_corners = align_corners + self.padding_mode = padding_mode + + self.lateral_convs = None + if lateral_conv: + self.lateral_convs = nn.ModuleList() + for i in range(len(img_channels)): + l_conv = ConvModule( + img_channels[i], + mid_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=self.act_cfg, + inplace=False) + self.lateral_convs.append(l_conv) + self.img_transform = nn.Sequential( + nn.Linear(mid_channels * len(img_channels), out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + else: + self.img_transform = nn.Sequential( + nn.Linear(sum(img_channels), out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + self.pts_transform = nn.Sequential( + nn.Linear(pts_channels, out_channels), + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + ) + + if self.fuse_out: + self.fuse_conv = nn.Sequential( + nn.Linear(mid_channels, out_channels), + # For pts the BN is initialized differently by default + # TODO: check whether this is necessary + nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), + nn.ReLU(inplace=False)) + + if init_cfg is None: + self.init_cfg = [ + dict(type='Xavier', layer='Conv2d', distribution='uniform'), + dict(type='Xavier', layer='Linear', distribution='uniform') + ] + + def forward(self, img_feats, pts, pts_feats, img_metas): + """Forward function. + + Args: + img_feats (list[torch.Tensor]): Image features. + pts: [list[torch.Tensor]]: A batch of points with shape N x 3. + pts_feats (torch.Tensor): A tensor consist of point features of the + total batch. + img_metas (list[dict]): Meta information of images. + + Returns: + torch.Tensor: Fused features of each point. + """ + img_pts = self.obtain_mlvl_feats(img_feats, pts, img_metas) + img_pre_fuse = self.img_transform(img_pts) + if self.training and self.dropout_ratio > 0: + img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio) + pts_pre_fuse = self.pts_transform(pts_feats) + + fuse_out = img_pre_fuse + pts_pre_fuse + if self.activate_out: + fuse_out = F.relu(fuse_out) + if self.fuse_out: + fuse_out = self.fuse_conv(fuse_out) + + return fuse_out + + def obtain_mlvl_feats(self, img_feats, pts, img_metas): + """Obtain multi-level features for each point. 
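The fusion in forward() is deliberately simple: both modalities are projected to the same width, added element-wise and (optionally) activated. A condensed, standalone sketch of that arithmetic with hypothetical channel sizes:

import torch
from torch import nn
import torch.nn.functional as F

# Hypothetical sizes mirroring PointFusion.forward above.
num_points, img_c, pts_c, out_c = 1024, 256, 64, 128

img_pts = torch.rand(num_points, img_c)    # per-point image features (after sampling)
pts_feats = torch.rand(num_points, pts_c)  # per-point LiDAR features

img_transform = nn.Sequential(nn.Linear(img_c, out_c),
                              nn.BatchNorm1d(out_c, eps=1e-3, momentum=0.01))
pts_transform = nn.Sequential(nn.Linear(pts_c, out_c),
                              nn.BatchNorm1d(out_c, eps=1e-3, momentum=0.01))

# Project both modalities to out_c, sum per point, then activate.
fused = F.relu(img_transform(img_pts) + pts_transform(pts_feats))
print(fused.shape)  # torch.Size([1024, 128])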
+ + Args: + img_feats (list(torch.Tensor)): Multi-scale image features produced + by image backbone in shape (N, C, H, W). + pts (list[torch.Tensor]): Points of each sample. + img_metas (list[dict]): Meta information for each sample. + + Returns: + torch.Tensor: Corresponding image features of each point. + """ + if self.lateral_convs is not None: + img_ins = [ + lateral_conv(img_feats[i]) + for i, lateral_conv in zip(self.img_levels, self.lateral_convs) + ] + else: + img_ins = img_feats + img_feats_per_point = [] + # Sample multi-level features + for i in range(len(img_metas)): + mlvl_img_feats = [] + for level in range(len(self.img_levels)): + mlvl_img_feats.append( + self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3], + img_metas[i])) + mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1) + img_feats_per_point.append(mlvl_img_feats) + + img_pts = torch.cat(img_feats_per_point, dim=0) + return img_pts + + def sample_single(self, img_feats, pts, img_meta): + """Sample features from single level image feature map. + + Args: + img_feats (torch.Tensor): Image feature map in shape + (1, C, H, W). + pts (torch.Tensor): Points of a single sample. + img_meta (dict): Meta information of the single sample. + + Returns: + torch.Tensor: Single level image features of each point. + """ + # TODO: image transformation also extracted + img_scale_factor = ( + pts.new_tensor(img_meta['scale_factor'][:2]) + if 'scale_factor' in img_meta.keys() else 1) + img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False + img_crop_offset = ( + pts.new_tensor(img_meta['img_crop_offset']) + if 'img_crop_offset' in img_meta.keys() else 0) + proj_mat = get_proj_mat_by_coord_type(img_meta, self.coord_type) + img_pts = point_sample( + img_meta=img_meta, + img_features=img_feats, + points=pts, + proj_mat=pts.new_tensor(proj_mat), + coord_type=self.coord_type, + img_scale_factor=img_scale_factor, + img_crop_offset=img_crop_offset, + img_flip=img_flip, + img_pad_shape=img_meta['input_shape'][:2], + img_shape=img_meta['img_shape'][:2], + aligned=self.aligned, + padding_mode=self.padding_mode, + align_corners=self.align_corners, + ) + return img_pts diff --git a/mmdet3d/models/fusion_layers/vote_fusion.py b/mmdet3d/models/fusion_layers/vote_fusion.py index 3633e4d..35c268c 100644 --- a/mmdet3d/models/fusion_layers/vote_fusion.py +++ b/mmdet3d/models/fusion_layers/vote_fusion.py @@ -1,200 +1,200 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn as nn - -from mmdet3d.core.bbox import points_cam2img -from ..builder import FUSION_LAYERS -from . import apply_3d_transformation, bbox_2d_transform, coord_2d_transform - -EPS = 1e-6 - - -@FUSION_LAYERS.register_module() -class VoteFusion(nn.Module): - """Fuse 2d features from 3d seeds. - - Args: - num_classes (int): number of classes. - max_imvote_per_pixel (int): max number of imvotes. - """ - - def __init__(self, num_classes=10, max_imvote_per_pixel=3): - super(VoteFusion, self).__init__() - self.num_classes = num_classes - self.max_imvote_per_pixel = max_imvote_per_pixel - - def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas): - """Forward function. - - Args: - imgs (list[torch.Tensor]): Image features. - bboxes_2d_rescaled (list[torch.Tensor]): 2D bboxes. - seeds_3d_depth (torch.Tensor): 3D seeds. - img_metas (list[dict]): Meta information of images. - - Returns: - torch.Tensor: Concatenated cues of each point. - torch.Tensor: Validity mask of each feature. 
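Before moving on to VoteFusion, a construction sketch for the PointFusion module completed above. It only exercises the __init__ normalization (an int img_channels is broadcast to one entry per image level, and one lateral conv is built per level); the channel numbers are hypothetical and the import assumes the fusion_layers package re-exports the class.

from mmdet3d.models.fusion_layers import PointFusion  # assumed re-export

# Hypothetical configuration: three 256-channel image levels fused with
# 64-channel point features into 128-channel outputs.
fusion = PointFusion(
    img_channels=256,        # int -> broadcast to [256, 256, 256] in __init__
    pts_channels=64,
    mid_channels=128,
    out_channels=128,
    img_levels=[0, 1, 2],
    coord_type='LIDAR')

print(len(fusion.lateral_convs))  # 3: one lateral conv per image level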
- """ - img_features = [] - masks = [] - for i, data in enumerate( - zip(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas)): - img, bbox_2d_rescaled, seed_3d_depth, img_meta = data - bbox_num = bbox_2d_rescaled.shape[0] - seed_num = seed_3d_depth.shape[0] - - img_shape = img_meta['img_shape'] - img_h, img_w, _ = img_shape - - # first reverse the data transformations - xyz_depth = apply_3d_transformation( - seed_3d_depth, 'DEPTH', img_meta, reverse=True) - - # project points from depth to image - depth2img = xyz_depth.new_tensor(img_meta['depth2img']) - uvz_origin = points_cam2img(xyz_depth, depth2img, True) - z_cam = uvz_origin[..., 2] - uv_origin = (uvz_origin[..., :2] - 1).round() - - # rescale 2d coordinates and bboxes - uv_rescaled = coord_2d_transform(img_meta, uv_origin, True) - bbox_2d_origin = bbox_2d_transform(img_meta, bbox_2d_rescaled, - False) - - if bbox_num == 0: - imvote_num = seed_num * self.max_imvote_per_pixel - - # use zero features - two_cues = torch.zeros((15, imvote_num), - device=seed_3d_depth.device) - mask_zero = torch.zeros( - imvote_num - seed_num, device=seed_3d_depth.device).bool() - mask_one = torch.ones( - seed_num, device=seed_3d_depth.device).bool() - mask = torch.cat([mask_one, mask_zero], dim=0) - else: - # expand bboxes and seeds - bbox_expanded = bbox_2d_origin.view(1, bbox_num, -1).expand( - seed_num, -1, -1) - seed_2d_expanded = uv_origin.view(seed_num, 1, - -1).expand(-1, bbox_num, -1) - seed_2d_expanded_x, seed_2d_expanded_y = \ - seed_2d_expanded.split(1, dim=-1) - - bbox_expanded_l, bbox_expanded_t, bbox_expanded_r, \ - bbox_expanded_b, bbox_expanded_conf, bbox_expanded_cls = \ - bbox_expanded.split(1, dim=-1) - bbox_expanded_midx = (bbox_expanded_l + bbox_expanded_r) / 2 - bbox_expanded_midy = (bbox_expanded_t + bbox_expanded_b) / 2 - - seed_2d_in_bbox_x = (seed_2d_expanded_x > bbox_expanded_l) * \ - (seed_2d_expanded_x < bbox_expanded_r) - seed_2d_in_bbox_y = (seed_2d_expanded_y > bbox_expanded_t) * \ - (seed_2d_expanded_y < bbox_expanded_b) - seed_2d_in_bbox = seed_2d_in_bbox_x * seed_2d_in_bbox_y - - # semantic cues, dim=class_num - sem_cue = torch.zeros_like(bbox_expanded_conf).expand( - -1, -1, self.num_classes) - sem_cue = sem_cue.scatter(-1, bbox_expanded_cls.long(), - bbox_expanded_conf) - - # bbox center - uv - delta_u = bbox_expanded_midx - seed_2d_expanded_x - delta_v = bbox_expanded_midy - seed_2d_expanded_y - - seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand( - -1, bbox_num, -1) - - z_cam = z_cam.view(seed_num, 1, 1).expand(-1, bbox_num, -1) - imvote = torch.cat( - [delta_u, delta_v, - torch.zeros_like(delta_v)], dim=-1).view(-1, 3) - imvote = imvote * z_cam.reshape(-1, 1) - imvote = imvote @ torch.inverse(depth2img.t()) - - # apply transformation to lifted imvotes - imvote = apply_3d_transformation( - imvote, 'DEPTH', img_meta, reverse=False) - - seed_3d_expanded = seed_3d_expanded.reshape(imvote.shape) - - # ray angle - ray_angle = seed_3d_expanded + imvote - ray_angle /= torch.sqrt(torch.sum(ray_angle**2, -1) + - EPS).unsqueeze(-1) - - # imvote lifted to 3d - xz = ray_angle[:, [0, 2]] / (ray_angle[:, [1]] + EPS) \ - * seed_3d_expanded[:, [1]] - seed_3d_expanded[:, [0, 2]] - - # geometric cues, dim=5 - geo_cue = torch.cat([xz, ray_angle], - dim=-1).view(seed_num, -1, 5) - - two_cues = torch.cat([geo_cue, sem_cue], dim=-1) - # mask to 0 if seed not in bbox - two_cues = two_cues * seed_2d_in_bbox.float() - - feature_size = two_cues.shape[-1] - # if bbox number is too small, append zeros - if bbox_num < 
self.max_imvote_per_pixel: - append_num = self.max_imvote_per_pixel - bbox_num - append_zeros = torch.zeros( - (seed_num, append_num, 1), - device=seed_2d_in_bbox.device).bool() - seed_2d_in_bbox = torch.cat( - [seed_2d_in_bbox, append_zeros], dim=1) - append_zeros = torch.zeros( - (seed_num, append_num, feature_size), - device=two_cues.device) - two_cues = torch.cat([two_cues, append_zeros], dim=1) - append_zeros = torch.zeros((seed_num, append_num, 1), - device=two_cues.device) - bbox_expanded_conf = torch.cat( - [bbox_expanded_conf, append_zeros], dim=1) - - # sort the valid seed-bbox pair according to confidence - pair_score = seed_2d_in_bbox.float() + bbox_expanded_conf - # and find the largests - mask, indices = pair_score.topk( - self.max_imvote_per_pixel, - dim=1, - largest=True, - sorted=True) - - indices_img = indices.expand(-1, -1, feature_size) - two_cues = two_cues.gather(dim=1, index=indices_img) - two_cues = two_cues.transpose(1, 0) - two_cues = two_cues.reshape(-1, feature_size).transpose( - 1, 0).contiguous() - - # since conf is ~ (0, 1), floor gives us validity - mask = mask.floor().int() - mask = mask.transpose(1, 0).reshape(-1).bool() - - # clear the padding - img = img[:, :img_shape[0], :img_shape[1]] - img_flatten = img.reshape(3, -1).float() - img_flatten /= 255. - - # take the normalized pixel value as texture cue - uv_rescaled[:, 0] = torch.clamp(uv_rescaled[:, 0].round(), 0, - img_shape[1] - 1) - uv_rescaled[:, 1] = torch.clamp(uv_rescaled[:, 1].round(), 0, - img_shape[0] - 1) - uv_flatten = uv_rescaled[:, 1].round() * \ - img_shape[1] + uv_rescaled[:, 0].round() - uv_expanded = uv_flatten.unsqueeze(0).expand(3, -1).long() - txt_cue = torch.gather(img_flatten, dim=-1, index=uv_expanded) - txt_cue = txt_cue.unsqueeze(1).expand(-1, - self.max_imvote_per_pixel, - -1).reshape(3, -1) - - # append texture cue - img_feature = torch.cat([two_cues, txt_cue], dim=0) - img_features.append(img_feature) - masks.append(mask) - - return torch.stack(img_features, 0), torch.stack(masks, 0) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn + +from mmdet3d.core.bbox import points_cam2img +from ..builder import FUSION_LAYERS +from . import apply_3d_transformation, bbox_2d_transform, coord_2d_transform + +EPS = 1e-6 + + +@FUSION_LAYERS.register_module() +class VoteFusion(nn.Module): + """Fuse 2d features from 3d seeds. + + Args: + num_classes (int): number of classes. + max_imvote_per_pixel (int): max number of imvotes. + """ + + def __init__(self, num_classes=10, max_imvote_per_pixel=3): + super(VoteFusion, self).__init__() + self.num_classes = num_classes + self.max_imvote_per_pixel = max_imvote_per_pixel + + def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas): + """Forward function. + + Args: + imgs (list[torch.Tensor]): Image features. + bboxes_2d_rescaled (list[torch.Tensor]): 2D bboxes. + seeds_3d_depth (torch.Tensor): 3D seeds. + img_metas (list[dict]): Meta information of images. + + Returns: + torch.Tensor: Concatenated cues of each point. + torch.Tensor: Validity mask of each feature. 
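Reading the feature widths out of the cue construction above (with the default num_classes=10 and 3-channel images) gives the 15- and 18-channel figures that appear in the code; a small sanity-check sketch:

# Cue widths implied by the code above (defaults assumed: num_classes=10, RGB input).
geo_cue = 5            # 2 (xz offset of the lifted vote) + 3 (unit ray direction)
sem_cue = 10           # one confidence slot per class
txt_cue = 3            # normalized RGB value at the seed's pixel

two_cues = geo_cue + sem_cue        # 15, matching the zero-feature branch above
per_pair = two_cues + txt_cue       # 18 channels per (seed, imvote) pair

seed_num, max_imvote_per_pixel = 1024, 3
print(per_pair, seed_num * max_imvote_per_pixel)  # 18 features x 3072 seed-vote pairs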
+ """ + img_features = [] + masks = [] + for i, data in enumerate( + zip(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas)): + img, bbox_2d_rescaled, seed_3d_depth, img_meta = data + bbox_num = bbox_2d_rescaled.shape[0] + seed_num = seed_3d_depth.shape[0] + + img_shape = img_meta['img_shape'] + img_h, img_w, _ = img_shape + + # first reverse the data transformations + xyz_depth = apply_3d_transformation( + seed_3d_depth, 'DEPTH', img_meta, reverse=True) + + # project points from depth to image + depth2img = xyz_depth.new_tensor(img_meta['depth2img']) + uvz_origin = points_cam2img(xyz_depth, depth2img, True) + z_cam = uvz_origin[..., 2] + uv_origin = (uvz_origin[..., :2] - 1).round() + + # rescale 2d coordinates and bboxes + uv_rescaled = coord_2d_transform(img_meta, uv_origin, True) + bbox_2d_origin = bbox_2d_transform(img_meta, bbox_2d_rescaled, + False) + + if bbox_num == 0: + imvote_num = seed_num * self.max_imvote_per_pixel + + # use zero features + two_cues = torch.zeros((15, imvote_num), + device=seed_3d_depth.device) + mask_zero = torch.zeros( + imvote_num - seed_num, device=seed_3d_depth.device).bool() + mask_one = torch.ones( + seed_num, device=seed_3d_depth.device).bool() + mask = torch.cat([mask_one, mask_zero], dim=0) + else: + # expand bboxes and seeds + bbox_expanded = bbox_2d_origin.view(1, bbox_num, -1).expand( + seed_num, -1, -1) + seed_2d_expanded = uv_origin.view(seed_num, 1, + -1).expand(-1, bbox_num, -1) + seed_2d_expanded_x, seed_2d_expanded_y = \ + seed_2d_expanded.split(1, dim=-1) + + bbox_expanded_l, bbox_expanded_t, bbox_expanded_r, \ + bbox_expanded_b, bbox_expanded_conf, bbox_expanded_cls = \ + bbox_expanded.split(1, dim=-1) + bbox_expanded_midx = (bbox_expanded_l + bbox_expanded_r) / 2 + bbox_expanded_midy = (bbox_expanded_t + bbox_expanded_b) / 2 + + seed_2d_in_bbox_x = (seed_2d_expanded_x > bbox_expanded_l) * \ + (seed_2d_expanded_x < bbox_expanded_r) + seed_2d_in_bbox_y = (seed_2d_expanded_y > bbox_expanded_t) * \ + (seed_2d_expanded_y < bbox_expanded_b) + seed_2d_in_bbox = seed_2d_in_bbox_x * seed_2d_in_bbox_y + + # semantic cues, dim=class_num + sem_cue = torch.zeros_like(bbox_expanded_conf).expand( + -1, -1, self.num_classes) + sem_cue = sem_cue.scatter(-1, bbox_expanded_cls.long(), + bbox_expanded_conf) + + # bbox center - uv + delta_u = bbox_expanded_midx - seed_2d_expanded_x + delta_v = bbox_expanded_midy - seed_2d_expanded_y + + seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand( + -1, bbox_num, -1) + + z_cam = z_cam.view(seed_num, 1, 1).expand(-1, bbox_num, -1) + imvote = torch.cat( + [delta_u, delta_v, + torch.zeros_like(delta_v)], dim=-1).view(-1, 3) + imvote = imvote * z_cam.reshape(-1, 1) + imvote = imvote @ torch.inverse(depth2img.t()) + + # apply transformation to lifted imvotes + imvote = apply_3d_transformation( + imvote, 'DEPTH', img_meta, reverse=False) + + seed_3d_expanded = seed_3d_expanded.reshape(imvote.shape) + + # ray angle + ray_angle = seed_3d_expanded + imvote + ray_angle /= torch.sqrt(torch.sum(ray_angle**2, -1) + + EPS).unsqueeze(-1) + + # imvote lifted to 3d + xz = ray_angle[:, [0, 2]] / (ray_angle[:, [1]] + EPS) \ + * seed_3d_expanded[:, [1]] - seed_3d_expanded[:, [0, 2]] + + # geometric cues, dim=5 + geo_cue = torch.cat([xz, ray_angle], + dim=-1).view(seed_num, -1, 5) + + two_cues = torch.cat([geo_cue, sem_cue], dim=-1) + # mask to 0 if seed not in bbox + two_cues = two_cues * seed_2d_in_bbox.float() + + feature_size = two_cues.shape[-1] + # if bbox number is too small, append zeros + if bbox_num < 
self.max_imvote_per_pixel: + append_num = self.max_imvote_per_pixel - bbox_num + append_zeros = torch.zeros( + (seed_num, append_num, 1), + device=seed_2d_in_bbox.device).bool() + seed_2d_in_bbox = torch.cat( + [seed_2d_in_bbox, append_zeros], dim=1) + append_zeros = torch.zeros( + (seed_num, append_num, feature_size), + device=two_cues.device) + two_cues = torch.cat([two_cues, append_zeros], dim=1) + append_zeros = torch.zeros((seed_num, append_num, 1), + device=two_cues.device) + bbox_expanded_conf = torch.cat( + [bbox_expanded_conf, append_zeros], dim=1) + + # sort the valid seed-bbox pair according to confidence + pair_score = seed_2d_in_bbox.float() + bbox_expanded_conf + # and find the largests + mask, indices = pair_score.topk( + self.max_imvote_per_pixel, + dim=1, + largest=True, + sorted=True) + + indices_img = indices.expand(-1, -1, feature_size) + two_cues = two_cues.gather(dim=1, index=indices_img) + two_cues = two_cues.transpose(1, 0) + two_cues = two_cues.reshape(-1, feature_size).transpose( + 1, 0).contiguous() + + # since conf is ~ (0, 1), floor gives us validity + mask = mask.floor().int() + mask = mask.transpose(1, 0).reshape(-1).bool() + + # clear the padding + img = img[:, :img_shape[0], :img_shape[1]] + img_flatten = img.reshape(3, -1).float() + img_flatten /= 255. + + # take the normalized pixel value as texture cue + uv_rescaled[:, 0] = torch.clamp(uv_rescaled[:, 0].round(), 0, + img_shape[1] - 1) + uv_rescaled[:, 1] = torch.clamp(uv_rescaled[:, 1].round(), 0, + img_shape[0] - 1) + uv_flatten = uv_rescaled[:, 1].round() * \ + img_shape[1] + uv_rescaled[:, 0].round() + uv_expanded = uv_flatten.unsqueeze(0).expand(3, -1).long() + txt_cue = torch.gather(img_flatten, dim=-1, index=uv_expanded) + txt_cue = txt_cue.unsqueeze(1).expand(-1, + self.max_imvote_per_pixel, + -1).reshape(3, -1) + + # append texture cue + img_feature = torch.cat([two_cues, txt_cue], dim=0) + img_features.append(img_feature) + masks.append(mask) + + return torch.stack(img_features, 0), torch.stack(masks, 0) diff --git a/mmdet3d/models/losses/__init__.py b/mmdet3d/models/losses/__init__.py index dcdc69a..0b261b3 100644 --- a/mmdet3d/models/losses/__init__.py +++ b/mmdet3d/models/losses/__init__.py @@ -1,14 +1,14 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.models.losses import FocalLoss, SmoothL1Loss, binary_cross_entropy -from .axis_aligned_iou_loss import AxisAlignedIoULoss, axis_aligned_iou_loss -from .chamfer_distance import ChamferDistance, chamfer_distance -from .multibin_loss import MultiBinLoss -from .paconv_regularization_loss import PAConvRegularizationLoss -from .uncertain_smooth_l1_loss import UncertainL1Loss, UncertainSmoothL1Loss - -__all__ = [ - 'FocalLoss', 'SmoothL1Loss', 'binary_cross_entropy', 'ChamferDistance', - 'chamfer_distance', 'axis_aligned_iou_loss', 'AxisAlignedIoULoss', - 'PAConvRegularizationLoss', 'UncertainL1Loss', 'UncertainSmoothL1Loss', - 'MultiBinLoss' -] +# Copyright (c) OpenMMLab. All rights reserved. 
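The validity mask above falls out of a small trick: pair_score is the 0/1 in-box indicator plus a detection confidence in (0, 1), so after topk the score is at least 1 exactly when the seed really fell inside a box, and floor() turns that into a boolean. A toy check with hypothetical scores for one seed and four candidate boxes:

import torch

# in-box indicator (0/1) + detection confidence in (0, 1), one seed, four boxes.
pair_score = torch.tensor([[0.0 + 0.9, 1.0 + 0.4, 1.0 + 0.7, 0.0 + 0.2]])

score, idx = pair_score.topk(3, dim=1, largest=True, sorted=True)
valid = score.floor().bool()
print(idx)    # tensor([[2, 1, 0]]): in-box pairs ranked by confidence come first
print(valid)  # tensor([[ True,  True, False]]): the out-of-box pair is masked out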
+from mmdet.models.losses import FocalLoss, SmoothL1Loss, binary_cross_entropy +from .axis_aligned_iou_loss import AxisAlignedIoULoss, axis_aligned_iou_loss +from .chamfer_distance import ChamferDistance, chamfer_distance +from .multibin_loss import MultiBinLoss +from .paconv_regularization_loss import PAConvRegularizationLoss +from .uncertain_smooth_l1_loss import UncertainL1Loss, UncertainSmoothL1Loss + +__all__ = [ + 'FocalLoss', 'SmoothL1Loss', 'binary_cross_entropy', 'ChamferDistance', + 'chamfer_distance', 'axis_aligned_iou_loss', 'AxisAlignedIoULoss', + 'PAConvRegularizationLoss', 'UncertainL1Loss', 'UncertainSmoothL1Loss', + 'MultiBinLoss' +] diff --git a/mmdet3d/models/losses/axis_aligned_iou_loss.py b/mmdet3d/models/losses/axis_aligned_iou_loss.py index 428d7bb..1d0db0f 100644 --- a/mmdet3d/models/losses/axis_aligned_iou_loss.py +++ b/mmdet3d/models/losses/axis_aligned_iou_loss.py @@ -1,79 +1,79 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn as nn - -from mmdet.models.losses.utils import weighted_loss -from ...core.bbox import AxisAlignedBboxOverlaps3D -from ..builder import LOSSES - - -@weighted_loss -def axis_aligned_iou_loss(pred, target): - """Calculate the IoU loss (1-IoU) of two set of axis aligned bounding - boxes. Note that predictions and targets are one-to-one corresponded. - - Args: - pred (torch.Tensor): Bbox predictions with shape [..., 3]. - target (torch.Tensor): Bbox targets (gt) with shape [..., 3]. - - Returns: - torch.Tensor: IoU loss between predictions and targets. - """ - - axis_aligned_iou = AxisAlignedBboxOverlaps3D()( - pred, target, is_aligned=True) - iou_loss = 1 - axis_aligned_iou - return iou_loss - - -@LOSSES.register_module() -class AxisAlignedIoULoss(nn.Module): - """Calculate the IoU loss (1-IoU) of axis aligned bounding boxes. - - Args: - reduction (str): Method to reduce losses. - The valid reduction method are none, sum or mean. - loss_weight (float, optional): Weight of loss. Defaults to 1.0. - """ - - def __init__(self, reduction='mean', loss_weight=1.0): - super(AxisAlignedIoULoss, self).__init__() - assert reduction in ['none', 'sum', 'mean'] - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - weight=None, - avg_factor=None, - reduction_override=None, - **kwargs): - """Forward function of loss calculation. - - Args: - pred (torch.Tensor): Bbox predictions with shape [..., 3]. - target (torch.Tensor): Bbox targets (gt) with shape [..., 3]. - weight (torch.Tensor | float, optional): Weight of loss. - Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): Method to reduce losses. - The valid reduction method are 'none', 'sum' or 'mean'. - Defaults to None. - - Returns: - torch.Tensor: IoU loss between predictions and targets. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - if (weight is not None) and (not torch.any(weight > 0)) and ( - reduction != 'none'): - return (pred * weight).sum() - return axis_aligned_iou_loss( - pred, - target, - weight=weight, - avg_factor=avg_factor, - reduction=reduction) * self.loss_weight +# Copyright (c) OpenMMLab. All rights reserved. 
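The loss above is literally 1 - IoU of aligned box pairs. A hand-checkable sketch of that quantity for two hypothetical axis-aligned boxes in (x1, y1, z1, x2, y2, z2) form, computed with plain torch rather than the overlap op used in the code:

import torch

def axis_aligned_iou(box_a, box_b):
    """IoU of two axis-aligned 3D boxes given as (x1, y1, z1, x2, y2, z2)."""
    lt = torch.max(box_a[:3], box_b[:3])      # intersection min corner
    rb = torch.min(box_a[3:], box_b[3:])      # intersection max corner
    inter = (rb - lt).clamp(min=0).prod()
    vol_a = (box_a[3:] - box_a[:3]).prod()
    vol_b = (box_b[3:] - box_b[:3]).prod()
    return inter / (vol_a + vol_b - inter)

pred = torch.tensor([0., 0., 0., 2., 2., 2.])    # 2 x 2 x 2 cube
target = torch.tensor([1., 0., 0., 3., 2., 2.])  # same cube shifted by 1 along x

iou = axis_aligned_iou(pred, target)   # intersection 4, union 12 -> IoU = 1/3
print(1 - iou)                         # IoU loss ~= 0.6667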
+import torch +from torch import nn as nn + +from mmdet.models.losses.utils import weighted_loss +from ...core.bbox import AxisAlignedBboxOverlaps3D +from ..builder import LOSSES + + +@weighted_loss +def axis_aligned_iou_loss(pred, target): + """Calculate the IoU loss (1-IoU) of two set of axis aligned bounding + boxes. Note that predictions and targets are one-to-one corresponded. + + Args: + pred (torch.Tensor): Bbox predictions with shape [..., 3]. + target (torch.Tensor): Bbox targets (gt) with shape [..., 3]. + + Returns: + torch.Tensor: IoU loss between predictions and targets. + """ + + axis_aligned_iou = AxisAlignedBboxOverlaps3D()( + pred, target, is_aligned=True) + iou_loss = 1 - axis_aligned_iou + return iou_loss + + +@LOSSES.register_module() +class AxisAlignedIoULoss(nn.Module): + """Calculate the IoU loss (1-IoU) of axis aligned bounding boxes. + + Args: + reduction (str): Method to reduce losses. + The valid reduction method are none, sum or mean. + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(AxisAlignedIoULoss, self).__init__() + assert reduction in ['none', 'sum', 'mean'] + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function of loss calculation. + + Args: + pred (torch.Tensor): Bbox predictions with shape [..., 3]. + target (torch.Tensor): Bbox targets (gt) with shape [..., 3]. + weight (torch.Tensor | float, optional): Weight of loss. + Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + Defaults to None. + + Returns: + torch.Tensor: IoU loss between predictions and targets. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if (weight is not None) and (not torch.any(weight > 0)) and ( + reduction != 'none'): + return (pred * weight).sum() + return axis_aligned_iou_loss( + pred, + target, + weight=weight, + avg_factor=avg_factor, + reduction=reduction) * self.loss_weight diff --git a/mmdet3d/models/losses/chamfer_distance.py b/mmdet3d/models/losses/chamfer_distance.py index 8ad109d..367c30a 100644 --- a/mmdet3d/models/losses/chamfer_distance.py +++ b/mmdet3d/models/losses/chamfer_distance.py @@ -1,147 +1,147 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn as nn -from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss - -from ..builder import LOSSES - - -def chamfer_distance(src, - dst, - src_weight=1.0, - dst_weight=1.0, - criterion_mode='l2', - reduction='mean'): - """Calculate Chamfer Distance of two sets. - - Args: - src (torch.Tensor): Source set with shape [B, N, C] to - calculate Chamfer Distance. - dst (torch.Tensor): Destination set with shape [B, M, C] to - calculate Chamfer Distance. - src_weight (torch.Tensor or float): Weight of source loss. - dst_weight (torch.Tensor or float): Weight of destination loss. - criterion_mode (str): Criterion mode to calculate distance. - The valid modes are smooth_l1, l1 or l2. - reduction (str): Method to reduce losses. - The valid reduction method are 'none', 'sum' or 'mean'. - - Returns: - tuple: Source and Destination loss with the corresponding indices. 
- - - loss_src (torch.Tensor): The min distance - from source to destination. - - loss_dst (torch.Tensor): The min distance - from destination to source. - - indices1 (torch.Tensor): Index the min distance point - for each point in source to destination. - - indices2 (torch.Tensor): Index the min distance point - for each point in destination to source. - """ - - if criterion_mode == 'smooth_l1': - criterion = smooth_l1_loss - elif criterion_mode == 'l1': - criterion = l1_loss - elif criterion_mode == 'l2': - criterion = mse_loss - else: - raise NotImplementedError - - src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1) - dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1) - - distance = criterion(src_expand, dst_expand, reduction='none').sum(-1) - src2dst_distance, indices1 = torch.min(distance, dim=2) # (B,N) - dst2src_distance, indices2 = torch.min(distance, dim=1) # (B,M) - - loss_src = (src2dst_distance * src_weight) - loss_dst = (dst2src_distance * dst_weight) - - if reduction == 'sum': - loss_src = torch.sum(loss_src) - loss_dst = torch.sum(loss_dst) - elif reduction == 'mean': - loss_src = torch.mean(loss_src) - loss_dst = torch.mean(loss_dst) - elif reduction == 'none': - pass - else: - raise NotImplementedError - - return loss_src, loss_dst, indices1, indices2 - - -@LOSSES.register_module() -class ChamferDistance(nn.Module): - """Calculate Chamfer Distance of two sets. - - Args: - mode (str): Criterion mode to calculate distance. - The valid modes are smooth_l1, l1 or l2. - reduction (str): Method to reduce losses. - The valid reduction method are none, sum or mean. - loss_src_weight (float): Weight of loss_source. - loss_dst_weight (float): Weight of loss_target. - """ - - def __init__(self, - mode='l2', - reduction='mean', - loss_src_weight=1.0, - loss_dst_weight=1.0): - super(ChamferDistance, self).__init__() - - assert mode in ['smooth_l1', 'l1', 'l2'] - assert reduction in ['none', 'sum', 'mean'] - self.mode = mode - self.reduction = reduction - self.loss_src_weight = loss_src_weight - self.loss_dst_weight = loss_dst_weight - - def forward(self, - source, - target, - src_weight=1.0, - dst_weight=1.0, - reduction_override=None, - return_indices=False, - **kwargs): - """Forward function of loss calculation. - - Args: - source (torch.Tensor): Source set with shape [B, N, C] to - calculate Chamfer Distance. - target (torch.Tensor): Destination set with shape [B, M, C] to - calculate Chamfer Distance. - src_weight (torch.Tensor | float, optional): - Weight of source loss. Defaults to 1.0. - dst_weight (torch.Tensor | float, optional): - Weight of destination loss. Defaults to 1.0. - reduction_override (str, optional): Method to reduce losses. - The valid reduction method are 'none', 'sum' or 'mean'. - Defaults to None. - return_indices (bool, optional): Whether to return indices. - Defaults to False. - - Returns: - tuple[torch.Tensor]: If ``return_indices=True``, return losses of - source and target with their corresponding indices in the - order of ``(loss_source, loss_target, indices1, indices2)``. - If ``return_indices=False``, return - ``(loss_source, loss_target)``. 
- """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - - loss_source, loss_target, indices1, indices2 = chamfer_distance( - source, target, src_weight, dst_weight, self.mode, reduction) - - loss_source *= self.loss_src_weight - loss_target *= self.loss_dst_weight - - if return_indices: - return loss_source, loss_target, indices1, indices2 - else: - return loss_source, loss_target +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn +from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss + +from ..builder import LOSSES + + +def chamfer_distance(src, + dst, + src_weight=1.0, + dst_weight=1.0, + criterion_mode='l2', + reduction='mean'): + """Calculate Chamfer Distance of two sets. + + Args: + src (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + dst (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor or float): Weight of source loss. + dst_weight (torch.Tensor or float): Weight of destination loss. + criterion_mode (str): Criterion mode to calculate distance. + The valid modes are smooth_l1, l1 or l2. + reduction (str): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + + Returns: + tuple: Source and Destination loss with the corresponding indices. + + - loss_src (torch.Tensor): The min distance + from source to destination. + - loss_dst (torch.Tensor): The min distance + from destination to source. + - indices1 (torch.Tensor): Index the min distance point + for each point in source to destination. + - indices2 (torch.Tensor): Index the min distance point + for each point in destination to source. + """ + + if criterion_mode == 'smooth_l1': + criterion = smooth_l1_loss + elif criterion_mode == 'l1': + criterion = l1_loss + elif criterion_mode == 'l2': + criterion = mse_loss + else: + raise NotImplementedError + + src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1) + dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1) + + distance = criterion(src_expand, dst_expand, reduction='none').sum(-1) + src2dst_distance, indices1 = torch.min(distance, dim=2) # (B,N) + dst2src_distance, indices2 = torch.min(distance, dim=1) # (B,M) + + loss_src = (src2dst_distance * src_weight) + loss_dst = (dst2src_distance * dst_weight) + + if reduction == 'sum': + loss_src = torch.sum(loss_src) + loss_dst = torch.sum(loss_dst) + elif reduction == 'mean': + loss_src = torch.mean(loss_src) + loss_dst = torch.mean(loss_dst) + elif reduction == 'none': + pass + else: + raise NotImplementedError + + return loss_src, loss_dst, indices1, indices2 + + +@LOSSES.register_module() +class ChamferDistance(nn.Module): + """Calculate Chamfer Distance of two sets. + + Args: + mode (str): Criterion mode to calculate distance. + The valid modes are smooth_l1, l1 or l2. + reduction (str): Method to reduce losses. + The valid reduction method are none, sum or mean. + loss_src_weight (float): Weight of loss_source. + loss_dst_weight (float): Weight of loss_target. 
+ """ + + def __init__(self, + mode='l2', + reduction='mean', + loss_src_weight=1.0, + loss_dst_weight=1.0): + super(ChamferDistance, self).__init__() + + assert mode in ['smooth_l1', 'l1', 'l2'] + assert reduction in ['none', 'sum', 'mean'] + self.mode = mode + self.reduction = reduction + self.loss_src_weight = loss_src_weight + self.loss_dst_weight = loss_dst_weight + + def forward(self, + source, + target, + src_weight=1.0, + dst_weight=1.0, + reduction_override=None, + return_indices=False, + **kwargs): + """Forward function of loss calculation. + + Args: + source (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + target (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor | float, optional): + Weight of source loss. Defaults to 1.0. + dst_weight (torch.Tensor | float, optional): + Weight of destination loss. Defaults to 1.0. + reduction_override (str, optional): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + Defaults to None. + return_indices (bool, optional): Whether to return indices. + Defaults to False. + + Returns: + tuple[torch.Tensor]: If ``return_indices=True``, return losses of + source and target with their corresponding indices in the + order of ``(loss_source, loss_target, indices1, indices2)``. + If ``return_indices=False``, return + ``(loss_source, loss_target)``. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + loss_source, loss_target, indices1, indices2 = chamfer_distance( + source, target, src_weight, dst_weight, self.mode, reduction) + + loss_source *= self.loss_src_weight + loss_target *= self.loss_dst_weight + + if return_indices: + return loss_source, loss_target, indices1, indices2 + else: + return loss_source, loss_target diff --git a/mmdet3d/models/losses/multibin_loss.py b/mmdet3d/models/losses/multibin_loss.py index 461a19c..43d9b0f 100644 --- a/mmdet3d/models/losses/multibin_loss.py +++ b/mmdet3d/models/losses/multibin_loss.py @@ -1,93 +1,93 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn as nn -from torch.nn import functional as F - -from mmdet.models.losses.utils import weighted_loss -from ..builder import LOSSES - - -@weighted_loss -def multibin_loss(pred_orientations, gt_orientations, num_dir_bins=4): - """Multi-Bin Loss. - - Args: - pred_orientations(torch.Tensor): Predicted local vector - orientation in [axis_cls, head_cls, sin, cos] format. - shape (N, num_dir_bins * 4) - gt_orientations(torch.Tensor): Corresponding gt bboxes, - shape (N, num_dir_bins * 2). - num_dir_bins(int, optional): Number of bins to encode - direction angle. - Defaults: 4. - - Return: - torch.Tensor: Loss tensor. 
- """ - cls_losses = 0 - reg_losses = 0 - reg_cnt = 0 - for i in range(num_dir_bins): - # bin cls loss - cls_ce_loss = F.cross_entropy( - pred_orientations[:, (i * 2):(i * 2 + 2)], - gt_orientations[:, i].long(), - reduction='mean') - # regression loss - valid_mask_i = (gt_orientations[:, i] == 1) - cls_losses += cls_ce_loss - if valid_mask_i.sum() > 0: - start = num_dir_bins * 2 + i * 2 - end = start + 2 - pred_offset = F.normalize(pred_orientations[valid_mask_i, - start:end]) - gt_offset_sin = torch.sin(gt_orientations[valid_mask_i, - num_dir_bins + i]) - gt_offset_cos = torch.cos(gt_orientations[valid_mask_i, - num_dir_bins + i]) - reg_loss = \ - F.l1_loss(pred_offset[:, 0], gt_offset_sin, - reduction='none') + \ - F.l1_loss(pred_offset[:, 1], gt_offset_cos, - reduction='none') - - reg_losses += reg_loss.sum() - reg_cnt += valid_mask_i.sum() - - return cls_losses / num_dir_bins + reg_losses / reg_cnt - - -@LOSSES.register_module() -class MultiBinLoss(nn.Module): - """Multi-Bin Loss for orientation. - - Args: - reduction (str, optional): The method to reduce the loss. - Options are 'none', 'mean' and 'sum'. Defaults to 'none'. - loss_weight (float, optional): The weight of loss. Defaults - to 1.0. - """ - - def __init__(self, reduction='none', loss_weight=1.0): - super(MultiBinLoss, self).__init__() - assert reduction in ['none', 'sum', 'mean'] - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, pred, target, num_dir_bins, reduction_override=None): - """Forward function. - - Args: - pred (torch.Tensor): The prediction. - target (torch.Tensor): The learning target of the prediction. - num_dir_bins (int): Number of bins to encode direction angle. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss = self.loss_weight * multibin_loss( - pred, target, num_dir_bins=num_dir_bins, reduction=reduction) - return loss +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn +from torch.nn import functional as F + +from mmdet.models.losses.utils import weighted_loss +from ..builder import LOSSES + + +@weighted_loss +def multibin_loss(pred_orientations, gt_orientations, num_dir_bins=4): + """Multi-Bin Loss. + + Args: + pred_orientations(torch.Tensor): Predicted local vector + orientation in [axis_cls, head_cls, sin, cos] format. + shape (N, num_dir_bins * 4) + gt_orientations(torch.Tensor): Corresponding gt bboxes, + shape (N, num_dir_bins * 2). + num_dir_bins(int, optional): Number of bins to encode + direction angle. + Defaults: 4. + + Return: + torch.Tensor: Loss tensor. 
+ """ + cls_losses = 0 + reg_losses = 0 + reg_cnt = 0 + for i in range(num_dir_bins): + # bin cls loss + cls_ce_loss = F.cross_entropy( + pred_orientations[:, (i * 2):(i * 2 + 2)], + gt_orientations[:, i].long(), + reduction='mean') + # regression loss + valid_mask_i = (gt_orientations[:, i] == 1) + cls_losses += cls_ce_loss + if valid_mask_i.sum() > 0: + start = num_dir_bins * 2 + i * 2 + end = start + 2 + pred_offset = F.normalize(pred_orientations[valid_mask_i, + start:end]) + gt_offset_sin = torch.sin(gt_orientations[valid_mask_i, + num_dir_bins + i]) + gt_offset_cos = torch.cos(gt_orientations[valid_mask_i, + num_dir_bins + i]) + reg_loss = \ + F.l1_loss(pred_offset[:, 0], gt_offset_sin, + reduction='none') + \ + F.l1_loss(pred_offset[:, 1], gt_offset_cos, + reduction='none') + + reg_losses += reg_loss.sum() + reg_cnt += valid_mask_i.sum() + + return cls_losses / num_dir_bins + reg_losses / reg_cnt + + +@LOSSES.register_module() +class MultiBinLoss(nn.Module): + """Multi-Bin Loss for orientation. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are 'none', 'mean' and 'sum'. Defaults to 'none'. + loss_weight (float, optional): The weight of loss. Defaults + to 1.0. + """ + + def __init__(self, reduction='none', loss_weight=1.0): + super(MultiBinLoss, self).__init__() + assert reduction in ['none', 'sum', 'mean'] + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, num_dir_bins, reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + num_dir_bins (int): Number of bins to encode direction angle. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss = self.loss_weight * multibin_loss( + pred, target, num_dir_bins=num_dir_bins, reduction=reduction) + return loss diff --git a/mmdet3d/models/losses/paconv_regularization_loss.py b/mmdet3d/models/losses/paconv_regularization_loss.py index 2001790..2eba652 100644 --- a/mmdet3d/models/losses/paconv_regularization_loss.py +++ b/mmdet3d/models/losses/paconv_regularization_loss.py @@ -1,108 +1,108 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn as nn - -from mmdet3d.ops import PAConv, PAConvCUDA -from mmdet.models.losses.utils import weight_reduce_loss -from ..builder import LOSSES - - -def weight_correlation(conv): - """Calculate correlations between kernel weights in Conv's weight bank as - regularization loss. The cosine similarity is used as metrics. - - Args: - conv (nn.Module): A Conv modules to be regularized. - Currently we only support `PAConv` and `PAConvCUDA`. - - Returns: - torch.Tensor: Correlations between each kernel weights in weight bank. 
- """ - assert isinstance(conv, (PAConv, PAConvCUDA)), \ - f'unsupported module type {type(conv)}' - kernels = conv.weight_bank # [C_in, num_kernels * C_out] - in_channels = conv.in_channels - out_channels = conv.out_channels - num_kernels = conv.num_kernels - - # [num_kernels, Cin * Cout] - flatten_kernels = kernels.view(in_channels, num_kernels, out_channels).\ - permute(1, 0, 2).reshape(num_kernels, -1) - # [num_kernels, num_kernels] - inner_product = torch.matmul(flatten_kernels, flatten_kernels.T) - # [num_kernels, 1] - kernel_norms = torch.sum(flatten_kernels**2, dim=-1, keepdim=True)**0.5 - # [num_kernels, num_kernels] - kernel_norms = torch.matmul(kernel_norms, kernel_norms.T) - cosine_sims = inner_product / kernel_norms - # take upper triangular part excluding diagonal since we only compute - # correlation between different kernels once - # the square is to ensure positive loss, refer to: - # https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/tool/train.py#L208 - corr = torch.sum(torch.triu(cosine_sims, diagonal=1)**2) - - return corr - - -def paconv_regularization_loss(modules, reduction): - """Computes correlation loss of PAConv weight kernels as regularization. - - Args: - modules (List[nn.Module] | :obj:`generator`): - A list or a python generator of torch.nn.Modules. - reduction (str): Method to reduce losses among PAConv modules. - The valid reduction method are none, sum or mean. - - Returns: - torch.Tensor: Correlation loss of kernel weights. - """ - corr_loss = [] - for module in modules: - if isinstance(module, (PAConv, PAConvCUDA)): - corr_loss.append(weight_correlation(module)) - corr_loss = torch.stack(corr_loss) - - # perform reduction - corr_loss = weight_reduce_loss(corr_loss, reduction=reduction) - - return corr_loss - - -@LOSSES.register_module() -class PAConvRegularizationLoss(nn.Module): - """Calculate correlation loss of kernel weights in PAConv's weight bank. - - This is used as a regularization term in PAConv model training. - - Args: - reduction (str): Method to reduce losses. The reduction is performed - among all PAConv modules instead of prediction tensors. - The valid reduction method are none, sum or mean. - loss_weight (float, optional): Weight of loss. Defaults to 1.0. - """ - - def __init__(self, reduction='mean', loss_weight=1.0): - super(PAConvRegularizationLoss, self).__init__() - assert reduction in ['none', 'sum', 'mean'] - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, modules, reduction_override=None, **kwargs): - """Forward function of loss calculation. - - Args: - modules (List[nn.Module] | :obj:`generator`): - A list or a python generator of torch.nn.Modules. - reduction_override (str, optional): Method to reduce losses. - The valid reduction method are 'none', 'sum' or 'mean'. - Defaults to None. - - Returns: - torch.Tensor: Correlation loss of kernel weights. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - - return self.loss_weight * paconv_regularization_loss( - modules, reduction=reduction) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn + +from mmdet3d.ops import PAConv, PAConvCUDA +from mmdet.models.losses.utils import weight_reduce_loss +from ..builder import LOSSES + + +def weight_correlation(conv): + """Calculate correlations between kernel weights in Conv's weight bank as + regularization loss. The cosine similarity is used as metrics. 
+ + Args: + conv (nn.Module): A Conv modules to be regularized. + Currently we only support `PAConv` and `PAConvCUDA`. + + Returns: + torch.Tensor: Correlations between each kernel weights in weight bank. + """ + assert isinstance(conv, (PAConv, PAConvCUDA)), \ + f'unsupported module type {type(conv)}' + kernels = conv.weight_bank # [C_in, num_kernels * C_out] + in_channels = conv.in_channels + out_channels = conv.out_channels + num_kernels = conv.num_kernels + + # [num_kernels, Cin * Cout] + flatten_kernels = kernels.view(in_channels, num_kernels, out_channels).\ + permute(1, 0, 2).reshape(num_kernels, -1) + # [num_kernels, num_kernels] + inner_product = torch.matmul(flatten_kernels, flatten_kernels.T) + # [num_kernels, 1] + kernel_norms = torch.sum(flatten_kernels**2, dim=-1, keepdim=True)**0.5 + # [num_kernels, num_kernels] + kernel_norms = torch.matmul(kernel_norms, kernel_norms.T) + cosine_sims = inner_product / kernel_norms + # take upper triangular part excluding diagonal since we only compute + # correlation between different kernels once + # the square is to ensure positive loss, refer to: + # https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/tool/train.py#L208 + corr = torch.sum(torch.triu(cosine_sims, diagonal=1)**2) + + return corr + + +def paconv_regularization_loss(modules, reduction): + """Computes correlation loss of PAConv weight kernels as regularization. + + Args: + modules (List[nn.Module] | :obj:`generator`): + A list or a python generator of torch.nn.Modules. + reduction (str): Method to reduce losses among PAConv modules. + The valid reduction method are none, sum or mean. + + Returns: + torch.Tensor: Correlation loss of kernel weights. + """ + corr_loss = [] + for module in modules: + if isinstance(module, (PAConv, PAConvCUDA)): + corr_loss.append(weight_correlation(module)) + corr_loss = torch.stack(corr_loss) + + # perform reduction + corr_loss = weight_reduce_loss(corr_loss, reduction=reduction) + + return corr_loss + + +@LOSSES.register_module() +class PAConvRegularizationLoss(nn.Module): + """Calculate correlation loss of kernel weights in PAConv's weight bank. + + This is used as a regularization term in PAConv model training. + + Args: + reduction (str): Method to reduce losses. The reduction is performed + among all PAConv modules instead of prediction tensors. + The valid reduction method are none, sum or mean. + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(PAConvRegularizationLoss, self).__init__() + assert reduction in ['none', 'sum', 'mean'] + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, modules, reduction_override=None, **kwargs): + """Forward function of loss calculation. + + Args: + modules (List[nn.Module] | :obj:`generator`): + A list or a python generator of torch.nn.Modules. + reduction_override (str, optional): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + Defaults to None. + + Returns: + torch.Tensor: Correlation loss of kernel weights. 
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + return self.loss_weight * paconv_regularization_loss( + modules, reduction=reduction) diff --git a/mmdet3d/models/losses/uncertain_smooth_l1_loss.py b/mmdet3d/models/losses/uncertain_smooth_l1_loss.py index e80c08f..3ae51b4 100644 --- a/mmdet3d/models/losses/uncertain_smooth_l1_loss.py +++ b/mmdet3d/models/losses/uncertain_smooth_l1_loss.py @@ -1,176 +1,176 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn as nn - -from mmdet.models.losses.utils import weighted_loss -from ..builder import LOSSES - - -@weighted_loss -def uncertain_smooth_l1_loss(pred, target, sigma, alpha=1.0, beta=1.0): - """Smooth L1 loss with uncertainty. - - Args: - pred (torch.Tensor): The prediction. - target (torch.Tensor): The learning target of the prediction. - sigma (torch.Tensor): The sigma for uncertainty. - alpha (float, optional): The coefficient of log(sigma). - Defaults to 1.0. - beta (float, optional): The threshold in the piecewise function. - Defaults to 1.0. - - Returns: - torch.Tensor: Calculated loss - """ - assert beta > 0 - assert target.numel() > 0 - assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \ - f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \ - 'are inconsistent.' - diff = torch.abs(pred - target) - loss = torch.where(diff < beta, 0.5 * diff * diff / beta, - diff - 0.5 * beta) - loss = torch.exp(-sigma) * loss + alpha * sigma - - return loss - - -@weighted_loss -def uncertain_l1_loss(pred, target, sigma, alpha=1.0): - """L1 loss with uncertainty. - - Args: - pred (torch.Tensor): The prediction. - target (torch.Tensor): The learning target of the prediction. - sigma (torch.Tensor): The sigma for uncertainty. - alpha (float, optional): The coefficient of log(sigma). - Defaults to 1.0. - - Returns: - torch.Tensor: Calculated loss - """ - assert target.numel() > 0 - assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \ - f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \ - 'are inconsistent.' - loss = torch.abs(pred - target) - loss = torch.exp(-sigma) * loss + alpha * sigma - return loss - - -@LOSSES.register_module() -class UncertainSmoothL1Loss(nn.Module): - r"""Smooth L1 loss with uncertainty. - - Please refer to `PGD `_ and - `Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry - and Semantics `_ for more details. - - Args: - alpha (float, optional): The coefficient of log(sigma). - Defaults to 1.0. - beta (float, optional): The threshold in the piecewise function. - Defaults to 1.0. - reduction (str, optional): The method to reduce the loss. - Options are 'none', 'mean' and 'sum'. Defaults to 'mean'. - loss_weight (float, optional): The weight of loss. Defaults to 1.0 - """ - - def __init__(self, alpha=1.0, beta=1.0, reduction='mean', loss_weight=1.0): - super(UncertainSmoothL1Loss, self).__init__() - assert reduction in ['none', 'sum', 'mean'] - self.alpha = alpha - self.beta = beta - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - sigma, - weight=None, - avg_factor=None, - reduction_override=None, - **kwargs): - """Forward function. - - Args: - pred (torch.Tensor): The prediction. - target (torch.Tensor): The learning target of the prediction. - sigma (torch.Tensor): The sigma for uncertainty. 
- weight (torch.Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss_bbox = self.loss_weight * uncertain_smooth_l1_loss( - pred, - target, - weight, - sigma=sigma, - alpha=self.alpha, - beta=self.beta, - reduction=reduction, - avg_factor=avg_factor, - **kwargs) - return loss_bbox - - -@LOSSES.register_module() -class UncertainL1Loss(nn.Module): - """L1 loss with uncertainty. - - Args: - alpha (float, optional): The coefficient of log(sigma). - Defaults to 1.0. - reduction (str, optional): The method to reduce the loss. - Options are 'none', 'mean' and 'sum'. Defaults to 'mean'. - loss_weight (float, optional): The weight of loss. Defaults to 1.0. - """ - - def __init__(self, alpha=1.0, reduction='mean', loss_weight=1.0): - super(UncertainL1Loss, self).__init__() - assert reduction in ['none', 'sum', 'mean'] - self.alpha = alpha - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - sigma, - weight=None, - avg_factor=None, - reduction_override=None): - """Forward function. - - Args: - pred (torch.Tensor): The prediction. - target (torch.Tensor): The learning target of the prediction. - sigma (torch.Tensor): The sigma for uncertainty. - weight (torch.Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss_bbox = self.loss_weight * uncertain_l1_loss( - pred, - target, - weight, - sigma=sigma, - alpha=self.alpha, - reduction=reduction, - avg_factor=avg_factor) - return loss_bbox +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn + +from mmdet.models.losses.utils import weighted_loss +from ..builder import LOSSES + + +@weighted_loss +def uncertain_smooth_l1_loss(pred, target, sigma, alpha=1.0, beta=1.0): + """Smooth L1 loss with uncertainty. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + sigma (torch.Tensor): The sigma for uncertainty. + alpha (float, optional): The coefficient of log(sigma). + Defaults to 1.0. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + + Returns: + torch.Tensor: Calculated loss + """ + assert beta > 0 + assert target.numel() > 0 + assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \ + f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \ + 'are inconsistent.' + diff = torch.abs(pred - target) + loss = torch.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + loss = torch.exp(-sigma) * loss + alpha * sigma + + return loss + + +@weighted_loss +def uncertain_l1_loss(pred, target, sigma, alpha=1.0): + """L1 loss with uncertainty. + + Args: + pred (torch.Tensor): The prediction. 
+ target (torch.Tensor): The learning target of the prediction. + sigma (torch.Tensor): The sigma for uncertainty. + alpha (float, optional): The coefficient of log(sigma). + Defaults to 1.0. + + Returns: + torch.Tensor: Calculated loss + """ + assert target.numel() > 0 + assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \ + f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \ + 'are inconsistent.' + loss = torch.abs(pred - target) + loss = torch.exp(-sigma) * loss + alpha * sigma + return loss + + +@LOSSES.register_module() +class UncertainSmoothL1Loss(nn.Module): + r"""Smooth L1 loss with uncertainty. + + Please refer to `PGD `_ and + `Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry + and Semantics `_ for more details. + + Args: + alpha (float, optional): The coefficient of log(sigma). + Defaults to 1.0. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are 'none', 'mean' and 'sum'. Defaults to 'mean'. + loss_weight (float, optional): The weight of loss. Defaults to 1.0 + """ + + def __init__(self, alpha=1.0, beta=1.0, reduction='mean', loss_weight=1.0): + super(UncertainSmoothL1Loss, self).__init__() + assert reduction in ['none', 'sum', 'mean'] + self.alpha = alpha + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + sigma, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + sigma (torch.Tensor): The sigma for uncertainty. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * uncertain_smooth_l1_loss( + pred, + target, + weight, + sigma=sigma, + alpha=self.alpha, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox + + +@LOSSES.register_module() +class UncertainL1Loss(nn.Module): + """L1 loss with uncertainty. + + Args: + alpha (float, optional): The coefficient of log(sigma). + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are 'none', 'mean' and 'sum'. Defaults to 'mean'. + loss_weight (float, optional): The weight of loss. Defaults to 1.0. + """ + + def __init__(self, alpha=1.0, reduction='mean', loss_weight=1.0): + super(UncertainL1Loss, self).__init__() + assert reduction in ['none', 'sum', 'mean'] + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + sigma, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + sigma (torch.Tensor): The sigma for uncertainty. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. 
Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * uncertain_l1_loss( + pred, + target, + weight, + sigma=sigma, + alpha=self.alpha, + reduction=reduction, + avg_factor=avg_factor) + return loss_bbox diff --git a/mmdet3d/models/middle_encoders/__init__.py b/mmdet3d/models/middle_encoders/__init__.py index d7b4435..7581d56 100644 --- a/mmdet3d/models/middle_encoders/__init__.py +++ b/mmdet3d/models/middle_encoders/__init__.py @@ -1,8 +1,8 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .pillar_scatter import PointPillarsScatter -from .sparse_encoder import SparseEncoder, SparseEncoderSASSD -from .sparse_unet import SparseUNet - -__all__ = [ - 'PointPillarsScatter', 'SparseEncoder', 'SparseEncoderSASSD', 'SparseUNet' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .pillar_scatter import PointPillarsScatter +from .sparse_encoder import SparseEncoder, SparseEncoderSASSD +from .sparse_unet import SparseUNet + +__all__ = [ + 'PointPillarsScatter', 'SparseEncoder', 'SparseEncoderSASSD', 'SparseUNet' +] diff --git a/mmdet3d/models/middle_encoders/pillar_scatter.py b/mmdet3d/models/middle_encoders/pillar_scatter.py index 725ce29..74a1a2c 100644 --- a/mmdet3d/models/middle_encoders/pillar_scatter.py +++ b/mmdet3d/models/middle_encoders/pillar_scatter.py @@ -1,102 +1,102 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.runner import auto_fp16 -from torch import nn - -from ..builder import MIDDLE_ENCODERS - - -@MIDDLE_ENCODERS.register_module() -class PointPillarsScatter(nn.Module): - """Point Pillar's Scatter. - - Converts learned features from dense tensor to sparse pseudo image. - - Args: - in_channels (int): Channels of input features. - output_shape (list[int]): Required output shape of features. - """ - - def __init__(self, in_channels, output_shape): - super().__init__() - self.output_shape = output_shape - self.ny = output_shape[0] - self.nx = output_shape[1] - self.in_channels = in_channels - self.fp16_enabled = False - - @auto_fp16(apply_to=('voxel_features', )) - def forward(self, voxel_features, coors, batch_size=None): - """Foraward function to scatter features.""" - # TODO: rewrite the function in a batch manner - # no need to deal with different batch cases - if batch_size is not None: - return self.forward_batch(voxel_features, coors, batch_size) - else: - return self.forward_single(voxel_features, coors) - - def forward_single(self, voxel_features, coors): - """Scatter features of single sample. - - Args: - voxel_features (torch.Tensor): Voxel features in shape (N, M, C). - coors (torch.Tensor): Coordinates of each voxel. - The first column indicates the sample ID. - """ - # Create the canvas for this sample - canvas = torch.zeros( - self.in_channels, - self.nx * self.ny, - dtype=voxel_features.dtype, - device=voxel_features.device) - - indices = coors[:, 2] * self.nx + coors[:, 3] - indices = indices.long() - voxels = voxel_features.t() - # Now scatter the blob back to the canvas. - canvas[:, indices] = voxels - # Undo the column stacking to final 4-dim tensor - canvas = canvas.view(1, self.in_channels, self.ny, self.nx) - return canvas - - def forward_batch(self, voxel_features, coors, batch_size): - """Scatter features of single sample. 
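# A sketch of calling the uncertainty-aware regression losses defined above:
# pred, target and sigma must share one shape, and sigma is the per-element,
# log-scale uncertainty predicted by the network. The dummy tensors and the
# import path are illustrative assumptions.
import torch

from mmdet3d.models.losses import UncertainL1Loss, UncertainSmoothL1Loss

pred = torch.rand(16, 7)
target = torch.rand(16, 7)
sigma = torch.zeros(16, 7)  # exp(-0) == 1, so this reduces to the plain term

smooth_l1 = UncertainSmoothL1Loss(alpha=1.0, beta=1.0, reduction='mean')
plain_l1 = UncertainL1Loss(alpha=1.0, reduction='mean')

loss_a = smooth_l1(pred, target, sigma)  # scalar
loss_b = plain_l1(pred, target, sigma)   # scalar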
- - Args: - voxel_features (torch.Tensor): Voxel features in shape (N, M, C). - coors (torch.Tensor): Coordinates of each voxel in shape (N, 4). - The first column indicates the sample ID. - batch_size (int): Number of samples in the current batch. - """ - # batch_canvas will be the final output. - batch_canvas = [] - for batch_itt in range(batch_size): - # Create the canvas for this sample - canvas = torch.zeros( - self.in_channels, - self.nx * self.ny, - dtype=voxel_features.dtype, - device=voxel_features.device) - - # Only include non-empty pillars - batch_mask = coors[:, 0] == batch_itt - this_coors = coors[batch_mask, :] - indices = this_coors[:, 2] * self.nx + this_coors[:, 3] - indices = indices.type(torch.long) - voxels = voxel_features[batch_mask, :] - voxels = voxels.t() - - # Now scatter the blob back to the canvas. - canvas[:, indices] = voxels - - # Append to a list for later stacking. - batch_canvas.append(canvas) - - # Stack to 3-dim tensor (batch-size, in_channels, nrows*ncols) - batch_canvas = torch.stack(batch_canvas, 0) - - # Undo the column stacking to final 4-dim tensor - batch_canvas = batch_canvas.view(batch_size, self.in_channels, self.ny, - self.nx) - - return batch_canvas +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.runner import auto_fp16 +from torch import nn + +from ..builder import MIDDLE_ENCODERS + + +@MIDDLE_ENCODERS.register_module() +class PointPillarsScatter(nn.Module): + """Point Pillar's Scatter. + + Converts learned features from dense tensor to sparse pseudo image. + + Args: + in_channels (int): Channels of input features. + output_shape (list[int]): Required output shape of features. + """ + + def __init__(self, in_channels, output_shape): + super().__init__() + self.output_shape = output_shape + self.ny = output_shape[0] + self.nx = output_shape[1] + self.in_channels = in_channels + self.fp16_enabled = False + + @auto_fp16(apply_to=('voxel_features', )) + def forward(self, voxel_features, coors, batch_size=None): + """Foraward function to scatter features.""" + # TODO: rewrite the function in a batch manner + # no need to deal with different batch cases + if batch_size is not None: + return self.forward_batch(voxel_features, coors, batch_size) + else: + return self.forward_single(voxel_features, coors) + + def forward_single(self, voxel_features, coors): + """Scatter features of single sample. + + Args: + voxel_features (torch.Tensor): Voxel features in shape (N, M, C). + coors (torch.Tensor): Coordinates of each voxel. + The first column indicates the sample ID. + """ + # Create the canvas for this sample + canvas = torch.zeros( + self.in_channels, + self.nx * self.ny, + dtype=voxel_features.dtype, + device=voxel_features.device) + + indices = coors[:, 2] * self.nx + coors[:, 3] + indices = indices.long() + voxels = voxel_features.t() + # Now scatter the blob back to the canvas. + canvas[:, indices] = voxels + # Undo the column stacking to final 4-dim tensor + canvas = canvas.view(1, self.in_channels, self.ny, self.nx) + return canvas + + def forward_batch(self, voxel_features, coors, batch_size): + """Scatter features of single sample. + + Args: + voxel_features (torch.Tensor): Voxel features in shape (N, M, C). + coors (torch.Tensor): Coordinates of each voxel in shape (N, 4). + The first column indicates the sample ID. + batch_size (int): Number of samples in the current batch. + """ + # batch_canvas will be the final output. 
+ batch_canvas = [] + for batch_itt in range(batch_size): + # Create the canvas for this sample + canvas = torch.zeros( + self.in_channels, + self.nx * self.ny, + dtype=voxel_features.dtype, + device=voxel_features.device) + + # Only include non-empty pillars + batch_mask = coors[:, 0] == batch_itt + this_coors = coors[batch_mask, :] + indices = this_coors[:, 2] * self.nx + this_coors[:, 3] + indices = indices.type(torch.long) + voxels = voxel_features[batch_mask, :] + voxels = voxels.t() + + # Now scatter the blob back to the canvas. + canvas[:, indices] = voxels + + # Append to a list for later stacking. + batch_canvas.append(canvas) + + # Stack to 3-dim tensor (batch-size, in_channels, nrows*ncols) + batch_canvas = torch.stack(batch_canvas, 0) + + # Undo the column stacking to final 4-dim tensor + batch_canvas = batch_canvas.view(batch_size, self.in_channels, self.ny, + self.nx) + + return batch_canvas diff --git a/mmdet3d/models/middle_encoders/sparse_encoder.py b/mmdet3d/models/middle_encoders/sparse_encoder.py index 83a7a30..9b3a38d 100644 --- a/mmdet3d/models/middle_encoders/sparse_encoder.py +++ b/mmdet3d/models/middle_encoders/sparse_encoder.py @@ -1,491 +1,491 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.ops import points_in_boxes_all, three_interpolate, three_nn -from mmcv.runner import auto_fp16 -from torch import nn as nn - -from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule -from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE -from mmdet.models.losses import sigmoid_focal_loss, smooth_l1_loss -from ..builder import MIDDLE_ENCODERS - -if IS_SPCONV2_AVAILABLE: - from spconv.pytorch import SparseConvTensor, SparseSequential -else: - from mmcv.ops import SparseConvTensor, SparseSequential - - -@MIDDLE_ENCODERS.register_module() -class SparseEncoder(nn.Module): - r"""Sparse encoder for SECOND and Part-A2. - - Args: - in_channels (int): The number of input channels. - sparse_shape (list[int]): The sparse shape of input tensor. - order (list[str], optional): Order of conv module. - Defaults to ('conv', 'norm', 'act'). - norm_cfg (dict, optional): Config of normalization layer. Defaults to - dict(type='BN1d', eps=1e-3, momentum=0.01). - base_channels (int, optional): Out channels for conv_input layer. - Defaults to 16. - output_channels (int, optional): Out channels for conv_out layer. - Defaults to 128. - encoder_channels (tuple[tuple[int]], optional): - Convolutional channels of each encode block. - Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). - encoder_paddings (tuple[tuple[int]], optional): - Paddings of each encode block. - Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). - block_type (str, optional): Type of the block to use. - Defaults to 'conv_module'. 
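# A shape-level sketch of the PointPillarsScatter middle encoder defined
# above. The output_shape, channel count and random pillar coordinates are
# dummy values; the (batch_idx, z, y, x) column order follows the docstrings.
import torch

from mmdet3d.models.middle_encoders import PointPillarsScatter

scatter = PointPillarsScatter(in_channels=64, output_shape=[496, 432])  # [ny, nx]

num_pillars = 100
voxel_features = torch.rand(num_pillars, 64)           # (N, C) pillar features
coors = torch.zeros(num_pillars, 4, dtype=torch.long)  # (batch_idx, z, y, x)
coors[:, 2] = torch.randint(0, 496, (num_pillars, ))   # y index on the canvas
coors[:, 3] = torch.randint(0, 432, (num_pillars, ))   # x index on the canvas

canvas = scatter(voxel_features, coors)                 # (1, 64, 496, 432)
batched = scatter(voxel_features, coors, batch_size=1)  # (B, 64, 496, 432)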
- """ - - def __init__(self, - in_channels, - sparse_shape, - order=('conv', 'norm', 'act'), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - base_channels=16, - output_channels=128, - encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, - 64)), - encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, - 1)), - block_type='conv_module'): - super().__init__() - assert block_type in ['conv_module', 'basicblock'] - self.sparse_shape = sparse_shape - self.in_channels = in_channels - self.order = order - self.base_channels = base_channels - self.output_channels = output_channels - self.encoder_channels = encoder_channels - self.encoder_paddings = encoder_paddings - self.stage_num = len(self.encoder_channels) - self.fp16_enabled = False - # Spconv init all weight on its own - - assert isinstance(order, tuple) and len(order) == 3 - assert set(order) == {'conv', 'norm', 'act'} - - if self.order[0] != 'conv': # pre activate - self.conv_input = make_sparse_convmodule( - in_channels, - self.base_channels, - 3, - norm_cfg=norm_cfg, - padding=1, - indice_key='subm1', - conv_type='SubMConv3d', - order=('conv', )) - else: # post activate - self.conv_input = make_sparse_convmodule( - in_channels, - self.base_channels, - 3, - norm_cfg=norm_cfg, - padding=1, - indice_key='subm1', - conv_type='SubMConv3d') - - encoder_out_channels = self.make_encoder_layers( - make_sparse_convmodule, - norm_cfg, - self.base_channels, - block_type=block_type) - - self.conv_out = make_sparse_convmodule( - encoder_out_channels, - self.output_channels, - kernel_size=(3, 1, 1), - stride=(2, 1, 1), - norm_cfg=norm_cfg, - padding=0, - indice_key='spconv_down2', - conv_type='SparseConv3d') - - @auto_fp16(apply_to=('voxel_features', )) - def forward(self, voxel_features, coors, batch_size): - """Forward of SparseEncoder. - - Args: - voxel_features (torch.Tensor): Voxel features in shape (N, C). - coors (torch.Tensor): Coordinates in shape (N, 4), - the columns in the order of (batch_idx, z_idx, y_idx, x_idx). - batch_size (int): Batch size. - - Returns: - dict: Backbone features. - """ - coors = coors.int() - input_sp_tensor = SparseConvTensor(voxel_features, coors, - self.sparse_shape, batch_size) - x = self.conv_input(input_sp_tensor) - - encode_features = [] - for encoder_layer in self.encoder_layers: - x = encoder_layer(x) - encode_features.append(x) - - # for detection head - # [200, 176, 5] -> [200, 176, 2] - out = self.conv_out(encode_features[-1]) - spatial_features = out.dense() - - N, C, D, H, W = spatial_features.shape - spatial_features = spatial_features.view(N, C * D, H, W) - - return spatial_features - - def make_encoder_layers(self, - make_block, - norm_cfg, - in_channels, - block_type='conv_module', - conv_cfg=dict(type='SubMConv3d')): - """make encoder layers using sparse convs. - - Args: - make_block (method): A bounded function to build blocks. - norm_cfg (dict[str]): Config of normalization layer. - in_channels (int): The number of encoder input channels. - block_type (str, optional): Type of the block to use. - Defaults to 'conv_module'. - conv_cfg (dict, optional): Config of conv layer. Defaults to - dict(type='SubMConv3d'). - - Returns: - int: The number of encoder output channels. 
- """ - assert block_type in ['conv_module', 'basicblock'] - self.encoder_layers = SparseSequential() - - for i, blocks in enumerate(self.encoder_channels): - blocks_list = [] - for j, out_channels in enumerate(tuple(blocks)): - padding = tuple(self.encoder_paddings[i])[j] - # each stage started with a spconv layer - # except the first stage - if i != 0 and j == 0 and block_type == 'conv_module': - blocks_list.append( - make_block( - in_channels, - out_channels, - 3, - norm_cfg=norm_cfg, - stride=2, - padding=padding, - indice_key=f'spconv{i + 1}', - conv_type='SparseConv3d')) - elif block_type == 'basicblock': - if j == len(blocks) - 1 and i != len( - self.encoder_channels) - 1: - blocks_list.append( - make_block( - in_channels, - out_channels, - 3, - norm_cfg=norm_cfg, - stride=2, - padding=padding, - indice_key=f'spconv{i + 1}', - conv_type='SparseConv3d')) - else: - blocks_list.append( - SparseBasicBlock( - out_channels, - out_channels, - norm_cfg=norm_cfg, - conv_cfg=conv_cfg)) - else: - blocks_list.append( - make_block( - in_channels, - out_channels, - 3, - norm_cfg=norm_cfg, - padding=padding, - indice_key=f'subm{i + 1}', - conv_type='SubMConv3d')) - in_channels = out_channels - stage_name = f'encoder_layer{i + 1}' - stage_layers = SparseSequential(*blocks_list) - self.encoder_layers.add_module(stage_name, stage_layers) - return out_channels - - -@MIDDLE_ENCODERS.register_module() -class SparseEncoderSASSD(SparseEncoder): - r"""Sparse encoder for `SASSD `_ - - Args: - in_channels (int): The number of input channels. - sparse_shape (list[int]): The sparse shape of input tensor. - order (list[str], optional): Order of conv module. - Defaults to ('conv', 'norm', 'act'). - norm_cfg (dict, optional): Config of normalization layer. Defaults to - dict(type='BN1d', eps=1e-3, momentum=0.01). - base_channels (int, optional): Out channels for conv_input layer. - Defaults to 16. - output_channels (int, optional): Out channels for conv_out layer. - Defaults to 128. - encoder_channels (tuple[tuple[int]], optional): - Convolutional channels of each encode block. - Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). - encoder_paddings (tuple[tuple[int]], optional): - Paddings of each encode block. - Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). - block_type (str, optional): Type of the block to use. - Defaults to 'conv_module'. - """ - - def __init__(self, - in_channels, - sparse_shape, - order=('conv', 'norm', 'act'), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - base_channels=16, - output_channels=128, - encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, - 64)), - encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, - 1)), - block_type='conv_module'): - super(SparseEncoderSASSD, self).__init__( - in_channels=in_channels, - sparse_shape=sparse_shape, - order=order, - norm_cfg=norm_cfg, - base_channels=base_channels, - output_channels=output_channels, - encoder_channels=encoder_channels, - encoder_paddings=encoder_paddings, - block_type=block_type) - - self.point_fc = nn.Linear(112, 64, bias=False) - self.point_cls = nn.Linear(64, 1, bias=False) - self.point_reg = nn.Linear(64, 3, bias=False) - - @auto_fp16(apply_to=('voxel_features', )) - def forward(self, voxel_features, coors, batch_size, test_mode=False): - """Forward of SparseEncoder. - - Args: - voxel_features (torch.Tensor): Voxel features in shape (N, C). - coors (torch.Tensor): Coordinates in shape (N, 4), - the columns in the order of (batch_idx, z_idx, y_idx, x_idx). 
- batch_size (int): Batch size. - test_mode (bool, optional): Whether in test mode. - Defaults to False. - - Returns: - dict: Backbone features. - tuple[torch.Tensor]: Mean feature value of the points, - Classificaion result of the points, - Regression offsets of the points. - """ - coors = coors.int() - input_sp_tensor = SparseConvTensor(voxel_features, coors, - self.sparse_shape, batch_size) - x = self.conv_input(input_sp_tensor) - - encode_features = [] - for encoder_layer in self.encoder_layers: - x = encoder_layer(x) - encode_features.append(x) - - # for detection head - # [200, 176, 5] -> [200, 176, 2] - out = self.conv_out(encode_features[-1]) - spatial_features = out.dense() - - N, C, D, H, W = spatial_features.shape - spatial_features = spatial_features.view(N, C * D, H, W) - - if test_mode: - return spatial_features, None - - points_mean = torch.zeros_like(voxel_features) - points_mean[:, 0] = coors[:, 0] - points_mean[:, 1:] = voxel_features[:, :3] - - # auxiliary network - p0 = self.make_auxiliary_points( - encode_features[0], - points_mean, - offset=(0, -40., -3.), - voxel_size=(.1, .1, .2)) - - p1 = self.make_auxiliary_points( - encode_features[1], - points_mean, - offset=(0, -40., -3.), - voxel_size=(.2, .2, .4)) - - p2 = self.make_auxiliary_points( - encode_features[2], - points_mean, - offset=(0, -40., -3.), - voxel_size=(.4, .4, .8)) - - pointwise = torch.cat([p0, p1, p2], dim=-1) - pointwise = self.point_fc(pointwise) - point_cls = self.point_cls(pointwise) - point_reg = self.point_reg(pointwise) - point_misc = (points_mean, point_cls, point_reg) - - return spatial_features, point_misc - - def get_auxiliary_targets(self, nxyz, gt_boxes3d, enlarge=1.0): - """Get auxiliary target. - - Args: - nxyz (torch.Tensor): Mean features of the points. - gt_boxes3d (torch.Tensor): Coordinates in shape (N, 4), - the columns in the order of (batch_idx, z_idx, y_idx, x_idx). - enlarge (int, optional): Enlaged scale. Defaults to 1.0. - - Returns: - tuple[torch.Tensor]: Label of the points and - center offsets of the points. - """ - center_offsets = list() - pts_labels = list() - for i in range(len(gt_boxes3d)): - boxes3d = gt_boxes3d[i].tensor.cpu() - idx = torch.nonzero(nxyz[:, 0] == i).view(-1) - new_xyz = nxyz[idx, 1:].cpu() - - boxes3d[:, 3:6] *= enlarge - - pts_in_flag, center_offset = self.calculate_pts_offsets( - new_xyz, boxes3d) - pts_label = pts_in_flag.max(0)[0].byte() - pts_labels.append(pts_label) - center_offsets.append(center_offset) - - center_offsets = torch.cat(center_offsets).cuda() - pts_labels = torch.cat(pts_labels).to(center_offsets.device) - - return pts_labels, center_offsets - - def calculate_pts_offsets(self, points, boxes): - """Find all boxes in which each point is, as well as the offsets from - the box centers. - - Args: - points (torch.Tensor): [M, 3], [x, y, z] in LiDAR/DEPTH coordinate - boxes (torch.Tensor): [T, 7], - num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], - (x, y, z) is the bottom center. - - Returns: - tuple[torch.Tensor]: Point indices of boxes with the shape of - (T, M). Default background = 0. - And offsets from the box centers of points, - if it belows to the box, with the shape of (M, 3). - Default background = 0. 
- """ - boxes_num = len(boxes) - pts_num = len(points) - points = points.cuda() - boxes = boxes.to(points.device) - - box_idxs_of_pts = points_in_boxes_all(points[None, ...], boxes[None, - ...]) - - pts_indices = box_idxs_of_pts.squeeze(0).transpose(0, 1) - - center_offsets = torch.zeros_like(points).to(points.device) - - for i in range(boxes_num): - for j in range(pts_num): - if pts_indices[i][j] == 1: - center_offsets[j][0] = points[j][0] - boxes[i][0] - center_offsets[j][1] = points[j][1] - boxes[i][1] - center_offsets[j][2] = ( - points[j][2] - (boxes[i][2] + boxes[i][2] / 2.0)) - return pts_indices.cpu(), center_offsets.cpu() - - def aux_loss(self, points, point_cls, point_reg, gt_bboxes): - """Calculate auxiliary loss. - - Args: - points (torch.Tensor): Mean feature value of the points. - point_cls (torch.Tensor): Classificaion result of the points. - point_reg (torch.Tensor): Regression offsets of the points. - gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes for each sample. - - Returns: - dict: Backbone features. - """ - num_boxes = len(gt_bboxes) - - pts_labels, center_targets = self.get_auxiliary_targets( - points, gt_bboxes) - - rpn_cls_target = pts_labels.long() - pos = (pts_labels > 0).float() - neg = (pts_labels == 0).float() - - pos_normalizer = pos.sum().clamp(min=1.0) - - cls_weights = pos + neg - reg_weights = pos - reg_weights = reg_weights / pos_normalizer - - aux_loss_cls = sigmoid_focal_loss( - point_cls, - rpn_cls_target, - weight=cls_weights, - avg_factor=pos_normalizer) - - aux_loss_cls /= num_boxes - - weight = reg_weights[..., None] - aux_loss_reg = smooth_l1_loss(point_reg, center_targets, beta=1 / 9.) - aux_loss_reg = torch.sum(aux_loss_reg * weight)[None] - aux_loss_reg /= num_boxes - - aux_loss_cls, aux_loss_reg = [aux_loss_cls], [aux_loss_reg] - - return dict(aux_loss_cls=aux_loss_cls, aux_loss_reg=aux_loss_reg) - - def make_auxiliary_points(self, - source_tensor, - target, - offset=(0., -40., -3.), - voxel_size=(.05, .05, .1)): - """Make auxiliary points for loss computation. - - Args: - source_tensor (torch.Tensor): (M, C) features to be propigated. - target (torch.Tensor): (N, 4) bxyz positions of the - target features. - offset (tuple[float], optional): Voxelization offset. - Defaults to (0., -40., -3.) - voxel_size (tuple[float], optional): Voxelization size. - Defaults to (.05, .05, .1) - - Returns: - torch.Tensor: (N, C) tensor of the features of the target features. - """ - # Tansfer tensor to points - source = source_tensor.indices.float() - offset = torch.Tensor(offset).to(source.device) - voxel_size = torch.Tensor(voxel_size).to(source.device) - source[:, 1:] = ( - source[:, [3, 2, 1]] * voxel_size + offset + .5 * voxel_size) - - source_feats = source_tensor.features[None, ...].transpose(1, 2) - - # Interplate auxiliary points - dist, idx = three_nn(target[None, ...], source[None, ...]) - dist_recip = 1.0 / (dist + 1e-8) - norm = torch.sum(dist_recip, dim=2, keepdim=True) - weight = dist_recip / norm - new_features = three_interpolate(source_feats.contiguous(), idx, - weight) - - return new_features.squeeze(0).transpose(0, 1) +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from mmcv.ops import points_in_boxes_all, three_interpolate, three_nn +from mmcv.runner import auto_fp16 +from torch import nn as nn + +from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule +from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE +from mmdet.models.losses import sigmoid_focal_loss, smooth_l1_loss +from ..builder import MIDDLE_ENCODERS + +if IS_SPCONV2_AVAILABLE: + from spconv.pytorch import SparseConvTensor, SparseSequential +else: + from mmcv.ops import SparseConvTensor, SparseSequential + + +@MIDDLE_ENCODERS.register_module() +class SparseEncoder(nn.Module): + r"""Sparse encoder for SECOND and Part-A2. + + Args: + in_channels (int): The number of input channels. + sparse_shape (list[int]): The sparse shape of input tensor. + order (list[str], optional): Order of conv module. + Defaults to ('conv', 'norm', 'act'). + norm_cfg (dict, optional): Config of normalization layer. Defaults to + dict(type='BN1d', eps=1e-3, momentum=0.01). + base_channels (int, optional): Out channels for conv_input layer. + Defaults to 16. + output_channels (int, optional): Out channels for conv_out layer. + Defaults to 128. + encoder_channels (tuple[tuple[int]], optional): + Convolutional channels of each encode block. + Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). + encoder_paddings (tuple[tuple[int]], optional): + Paddings of each encode block. + Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). + block_type (str, optional): Type of the block to use. + Defaults to 'conv_module'. + """ + + def __init__(self, + in_channels, + sparse_shape, + order=('conv', 'norm', 'act'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + base_channels=16, + output_channels=128, + encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, + 64)), + encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, + 1)), + block_type='conv_module'): + super().__init__() + assert block_type in ['conv_module', 'basicblock'] + self.sparse_shape = sparse_shape + self.in_channels = in_channels + self.order = order + self.base_channels = base_channels + self.output_channels = output_channels + self.encoder_channels = encoder_channels + self.encoder_paddings = encoder_paddings + self.stage_num = len(self.encoder_channels) + self.fp16_enabled = False + # Spconv init all weight on its own + + assert isinstance(order, tuple) and len(order) == 3 + assert set(order) == {'conv', 'norm', 'act'} + + if self.order[0] != 'conv': # pre activate + self.conv_input = make_sparse_convmodule( + in_channels, + self.base_channels, + 3, + norm_cfg=norm_cfg, + padding=1, + indice_key='subm1', + conv_type='SubMConv3d', + order=('conv', )) + else: # post activate + self.conv_input = make_sparse_convmodule( + in_channels, + self.base_channels, + 3, + norm_cfg=norm_cfg, + padding=1, + indice_key='subm1', + conv_type='SubMConv3d') + + encoder_out_channels = self.make_encoder_layers( + make_sparse_convmodule, + norm_cfg, + self.base_channels, + block_type=block_type) + + self.conv_out = make_sparse_convmodule( + encoder_out_channels, + self.output_channels, + kernel_size=(3, 1, 1), + stride=(2, 1, 1), + norm_cfg=norm_cfg, + padding=0, + indice_key='spconv_down2', + conv_type='SparseConv3d') + + @auto_fp16(apply_to=('voxel_features', )) + def forward(self, voxel_features, coors, batch_size): + """Forward of SparseEncoder. + + Args: + voxel_features (torch.Tensor): Voxel features in shape (N, C). 
+ coors (torch.Tensor): Coordinates in shape (N, 4), + the columns in the order of (batch_idx, z_idx, y_idx, x_idx). + batch_size (int): Batch size. + + Returns: + dict: Backbone features. + """ + coors = coors.int() + input_sp_tensor = SparseConvTensor(voxel_features, coors, + self.sparse_shape, batch_size) + x = self.conv_input(input_sp_tensor) + + encode_features = [] + for encoder_layer in self.encoder_layers: + x = encoder_layer(x) + encode_features.append(x) + + # for detection head + # [200, 176, 5] -> [200, 176, 2] + out = self.conv_out(encode_features[-1]) + spatial_features = out.dense() + + N, C, D, H, W = spatial_features.shape + spatial_features = spatial_features.view(N, C * D, H, W) + + return spatial_features + + def make_encoder_layers(self, + make_block, + norm_cfg, + in_channels, + block_type='conv_module', + conv_cfg=dict(type='SubMConv3d')): + """make encoder layers using sparse convs. + + Args: + make_block (method): A bounded function to build blocks. + norm_cfg (dict[str]): Config of normalization layer. + in_channels (int): The number of encoder input channels. + block_type (str, optional): Type of the block to use. + Defaults to 'conv_module'. + conv_cfg (dict, optional): Config of conv layer. Defaults to + dict(type='SubMConv3d'). + + Returns: + int: The number of encoder output channels. + """ + assert block_type in ['conv_module', 'basicblock'] + self.encoder_layers = SparseSequential() + + for i, blocks in enumerate(self.encoder_channels): + blocks_list = [] + for j, out_channels in enumerate(tuple(blocks)): + padding = tuple(self.encoder_paddings[i])[j] + # each stage started with a spconv layer + # except the first stage + if i != 0 and j == 0 and block_type == 'conv_module': + blocks_list.append( + make_block( + in_channels, + out_channels, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=padding, + indice_key=f'spconv{i + 1}', + conv_type='SparseConv3d')) + elif block_type == 'basicblock': + if j == len(blocks) - 1 and i != len( + self.encoder_channels) - 1: + blocks_list.append( + make_block( + in_channels, + out_channels, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=padding, + indice_key=f'spconv{i + 1}', + conv_type='SparseConv3d')) + else: + blocks_list.append( + SparseBasicBlock( + out_channels, + out_channels, + norm_cfg=norm_cfg, + conv_cfg=conv_cfg)) + else: + blocks_list.append( + make_block( + in_channels, + out_channels, + 3, + norm_cfg=norm_cfg, + padding=padding, + indice_key=f'subm{i + 1}', + conv_type='SubMConv3d')) + in_channels = out_channels + stage_name = f'encoder_layer{i + 1}' + stage_layers = SparseSequential(*blocks_list) + self.encoder_layers.add_module(stage_name, stage_layers) + return out_channels + + +@MIDDLE_ENCODERS.register_module() +class SparseEncoderSASSD(SparseEncoder): + r"""Sparse encoder for `SASSD `_ + + Args: + in_channels (int): The number of input channels. + sparse_shape (list[int]): The sparse shape of input tensor. + order (list[str], optional): Order of conv module. + Defaults to ('conv', 'norm', 'act'). + norm_cfg (dict, optional): Config of normalization layer. Defaults to + dict(type='BN1d', eps=1e-3, momentum=0.01). + base_channels (int, optional): Out channels for conv_input layer. + Defaults to 16. + output_channels (int, optional): Out channels for conv_out layer. + Defaults to 128. + encoder_channels (tuple[tuple[int]], optional): + Convolutional channels of each encode block. + Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). 
+ encoder_paddings (tuple[tuple[int]], optional): + Paddings of each encode block. + Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). + block_type (str, optional): Type of the block to use. + Defaults to 'conv_module'. + """ + + def __init__(self, + in_channels, + sparse_shape, + order=('conv', 'norm', 'act'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + base_channels=16, + output_channels=128, + encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, + 64)), + encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, + 1)), + block_type='conv_module'): + super(SparseEncoderSASSD, self).__init__( + in_channels=in_channels, + sparse_shape=sparse_shape, + order=order, + norm_cfg=norm_cfg, + base_channels=base_channels, + output_channels=output_channels, + encoder_channels=encoder_channels, + encoder_paddings=encoder_paddings, + block_type=block_type) + + self.point_fc = nn.Linear(112, 64, bias=False) + self.point_cls = nn.Linear(64, 1, bias=False) + self.point_reg = nn.Linear(64, 3, bias=False) + + @auto_fp16(apply_to=('voxel_features', )) + def forward(self, voxel_features, coors, batch_size, test_mode=False): + """Forward of SparseEncoder. + + Args: + voxel_features (torch.Tensor): Voxel features in shape (N, C). + coors (torch.Tensor): Coordinates in shape (N, 4), + the columns in the order of (batch_idx, z_idx, y_idx, x_idx). + batch_size (int): Batch size. + test_mode (bool, optional): Whether in test mode. + Defaults to False. + + Returns: + dict: Backbone features. + tuple[torch.Tensor]: Mean feature value of the points, + Classificaion result of the points, + Regression offsets of the points. + """ + coors = coors.int() + input_sp_tensor = SparseConvTensor(voxel_features, coors, + self.sparse_shape, batch_size) + x = self.conv_input(input_sp_tensor) + + encode_features = [] + for encoder_layer in self.encoder_layers: + x = encoder_layer(x) + encode_features.append(x) + + # for detection head + # [200, 176, 5] -> [200, 176, 2] + out = self.conv_out(encode_features[-1]) + spatial_features = out.dense() + + N, C, D, H, W = spatial_features.shape + spatial_features = spatial_features.view(N, C * D, H, W) + + if test_mode: + return spatial_features, None + + points_mean = torch.zeros_like(voxel_features) + points_mean[:, 0] = coors[:, 0] + points_mean[:, 1:] = voxel_features[:, :3] + + # auxiliary network + p0 = self.make_auxiliary_points( + encode_features[0], + points_mean, + offset=(0, -40., -3.), + voxel_size=(.1, .1, .2)) + + p1 = self.make_auxiliary_points( + encode_features[1], + points_mean, + offset=(0, -40., -3.), + voxel_size=(.2, .2, .4)) + + p2 = self.make_auxiliary_points( + encode_features[2], + points_mean, + offset=(0, -40., -3.), + voxel_size=(.4, .4, .8)) + + pointwise = torch.cat([p0, p1, p2], dim=-1) + pointwise = self.point_fc(pointwise) + point_cls = self.point_cls(pointwise) + point_reg = self.point_reg(pointwise) + point_misc = (points_mean, point_cls, point_reg) + + return spatial_features, point_misc + + def get_auxiliary_targets(self, nxyz, gt_boxes3d, enlarge=1.0): + """Get auxiliary target. + + Args: + nxyz (torch.Tensor): Mean features of the points. + gt_boxes3d (torch.Tensor): Coordinates in shape (N, 4), + the columns in the order of (batch_idx, z_idx, y_idx, x_idx). + enlarge (int, optional): Enlaged scale. Defaults to 1.0. + + Returns: + tuple[torch.Tensor]: Label of the points and + center offsets of the points. 
+ """ + center_offsets = list() + pts_labels = list() + for i in range(len(gt_boxes3d)): + boxes3d = gt_boxes3d[i].tensor.cpu() + idx = torch.nonzero(nxyz[:, 0] == i).view(-1) + new_xyz = nxyz[idx, 1:].cpu() + + boxes3d[:, 3:6] *= enlarge + + pts_in_flag, center_offset = self.calculate_pts_offsets( + new_xyz, boxes3d) + pts_label = pts_in_flag.max(0)[0].byte() + pts_labels.append(pts_label) + center_offsets.append(center_offset) + + center_offsets = torch.cat(center_offsets).cuda() + pts_labels = torch.cat(pts_labels).to(center_offsets.device) + + return pts_labels, center_offsets + + def calculate_pts_offsets(self, points, boxes): + """Find all boxes in which each point is, as well as the offsets from + the box centers. + + Args: + points (torch.Tensor): [M, 3], [x, y, z] in LiDAR/DEPTH coordinate + boxes (torch.Tensor): [T, 7], + num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], + (x, y, z) is the bottom center. + + Returns: + tuple[torch.Tensor]: Point indices of boxes with the shape of + (T, M). Default background = 0. + And offsets from the box centers of points, + if it belows to the box, with the shape of (M, 3). + Default background = 0. + """ + boxes_num = len(boxes) + pts_num = len(points) + points = points.cuda() + boxes = boxes.to(points.device) + + box_idxs_of_pts = points_in_boxes_all(points[None, ...], boxes[None, + ...]) + + pts_indices = box_idxs_of_pts.squeeze(0).transpose(0, 1) + + center_offsets = torch.zeros_like(points).to(points.device) + + for i in range(boxes_num): + for j in range(pts_num): + if pts_indices[i][j] == 1: + center_offsets[j][0] = points[j][0] - boxes[i][0] + center_offsets[j][1] = points[j][1] - boxes[i][1] + center_offsets[j][2] = ( + points[j][2] - (boxes[i][2] + boxes[i][2] / 2.0)) + return pts_indices.cpu(), center_offsets.cpu() + + def aux_loss(self, points, point_cls, point_reg, gt_bboxes): + """Calculate auxiliary loss. + + Args: + points (torch.Tensor): Mean feature value of the points. + point_cls (torch.Tensor): Classificaion result of the points. + point_reg (torch.Tensor): Regression offsets of the points. + gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + + Returns: + dict: Backbone features. + """ + num_boxes = len(gt_bboxes) + + pts_labels, center_targets = self.get_auxiliary_targets( + points, gt_bboxes) + + rpn_cls_target = pts_labels.long() + pos = (pts_labels > 0).float() + neg = (pts_labels == 0).float() + + pos_normalizer = pos.sum().clamp(min=1.0) + + cls_weights = pos + neg + reg_weights = pos + reg_weights = reg_weights / pos_normalizer + + aux_loss_cls = sigmoid_focal_loss( + point_cls, + rpn_cls_target, + weight=cls_weights, + avg_factor=pos_normalizer) + + aux_loss_cls /= num_boxes + + weight = reg_weights[..., None] + aux_loss_reg = smooth_l1_loss(point_reg, center_targets, beta=1 / 9.) + aux_loss_reg = torch.sum(aux_loss_reg * weight)[None] + aux_loss_reg /= num_boxes + + aux_loss_cls, aux_loss_reg = [aux_loss_cls], [aux_loss_reg] + + return dict(aux_loss_cls=aux_loss_cls, aux_loss_reg=aux_loss_reg) + + def make_auxiliary_points(self, + source_tensor, + target, + offset=(0., -40., -3.), + voxel_size=(.05, .05, .1)): + """Make auxiliary points for loss computation. + + Args: + source_tensor (torch.Tensor): (M, C) features to be propigated. + target (torch.Tensor): (N, 4) bxyz positions of the + target features. + offset (tuple[float], optional): Voxelization offset. + Defaults to (0., -40., -3.) + voxel_size (tuple[float], optional): Voxelization size. 
+ Defaults to (.05, .05, .1) + + Returns: + torch.Tensor: (N, C) tensor of the features of the target features. + """ + # Tansfer tensor to points + source = source_tensor.indices.float() + offset = torch.Tensor(offset).to(source.device) + voxel_size = torch.Tensor(voxel_size).to(source.device) + source[:, 1:] = ( + source[:, [3, 2, 1]] * voxel_size + offset + .5 * voxel_size) + + source_feats = source_tensor.features[None, ...].transpose(1, 2) + + # Interplate auxiliary points + dist, idx = three_nn(target[None, ...], source[None, ...]) + dist_recip = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_recip, dim=2, keepdim=True) + weight = dist_recip / norm + new_features = three_interpolate(source_feats.contiguous(), idx, + weight) + + return new_features.squeeze(0).transpose(0, 1) diff --git a/mmdet3d/models/middle_encoders/sparse_unet.py b/mmdet3d/models/middle_encoders/sparse_unet.py index 005e34e..ff75dc8 100644 --- a/mmdet3d/models/middle_encoders/sparse_unet.py +++ b/mmdet3d/models/middle_encoders/sparse_unet.py @@ -1,300 +1,300 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE - -if IS_SPCONV2_AVAILABLE: - from spconv.pytorch import SparseConvTensor, SparseSequential -else: - from mmcv.ops import SparseConvTensor, SparseSequential - -from mmcv.runner import BaseModule, auto_fp16 - -from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule -from mmdet3d.ops.sparse_block import replace_feature -from ..builder import MIDDLE_ENCODERS - - -@MIDDLE_ENCODERS.register_module() -class SparseUNet(BaseModule): - r"""SparseUNet for PartA^2. - - See the `paper `_ for more details. - - Args: - in_channels (int): The number of input channels. - sparse_shape (list[int]): The sparse shape of input tensor. - norm_cfg (dict): Config of normalization layer. - base_channels (int): Out channels for conv_input layer. - output_channels (int): Out channels for conv_out layer. - encoder_channels (tuple[tuple[int]]): - Convolutional channels of each encode block. - encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. - decoder_channels (tuple[tuple[int]]): - Convolutional channels of each decode block. - decoder_paddings (tuple[tuple[int]]): Paddings of each decode block. 
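make_auxiliary_points above recovers metric voxel centers from sparse indices stored as (batch, z, y, x) and then propagates features to the target points with inverse-distance weights, which is what three_nn and three_interpolate compute on the GPU. A plain-PyTorch sketch of the same two steps on toy data (shapes and values are assumptions for illustration, not the library API):

import torch

voxel_size = torch.tensor([0.1, 0.1, 0.2])
offset = torch.tensor([0.0, -40.0, -3.0])

# sparse indices are (batch_idx, z, y, x); reorder to (x, y, z) and shift to
# the voxel center, mirroring source[:, [3, 2, 1]] * voxel_size + offset + .5 * voxel_size
indices = torch.tensor([[0, 4, 10, 20], [0, 5, 11, 21]], dtype=torch.float32)
centers = indices[:, [3, 2, 1]] * voxel_size + offset + 0.5 * voxel_size

# inverse-distance weights over the available sources (three_nn would keep 3)
target = torch.tensor([[2.0, -38.9, -2.0]])
dist = torch.cdist(target, centers)                   # (1, num_sources)
dist_recip = 1.0 / (dist + 1e-8)
weight = dist_recip / dist_recip.sum(dim=1, keepdim=True)

source_feats = torch.randn(2, 16)                     # (num_sources, C)
interpolated = weight @ source_feats                  # (1, C), like three_interpolate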
- """ - - def __init__(self, - in_channels, - sparse_shape, - order=('conv', 'norm', 'act'), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - base_channels=16, - output_channels=128, - encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, - 64)), - encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, - 1)), - decoder_channels=((64, 64, 64), (64, 64, 32), (32, 32, 16), - (16, 16, 16)), - decoder_paddings=((1, 0), (1, 0), (0, 0), (0, 1)), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.sparse_shape = sparse_shape - self.in_channels = in_channels - self.order = order - self.base_channels = base_channels - self.output_channels = output_channels - self.encoder_channels = encoder_channels - self.encoder_paddings = encoder_paddings - self.decoder_channels = decoder_channels - self.decoder_paddings = decoder_paddings - self.stage_num = len(self.encoder_channels) - self.fp16_enabled = False - # Spconv init all weight on its own - - assert isinstance(order, tuple) and len(order) == 3 - assert set(order) == {'conv', 'norm', 'act'} - - if self.order[0] != 'conv': # pre activate - self.conv_input = make_sparse_convmodule( - in_channels, - self.base_channels, - 3, - norm_cfg=norm_cfg, - padding=1, - indice_key='subm1', - conv_type='SubMConv3d', - order=('conv', )) - else: # post activate - self.conv_input = make_sparse_convmodule( - in_channels, - self.base_channels, - 3, - norm_cfg=norm_cfg, - padding=1, - indice_key='subm1', - conv_type='SubMConv3d') - - encoder_out_channels = self.make_encoder_layers( - make_sparse_convmodule, norm_cfg, self.base_channels) - self.make_decoder_layers(make_sparse_convmodule, norm_cfg, - encoder_out_channels) - - self.conv_out = make_sparse_convmodule( - encoder_out_channels, - self.output_channels, - kernel_size=(3, 1, 1), - stride=(2, 1, 1), - norm_cfg=norm_cfg, - padding=0, - indice_key='spconv_down2', - conv_type='SparseConv3d') - - @auto_fp16(apply_to=('voxel_features', )) - def forward(self, voxel_features, coors, batch_size): - """Forward of SparseUNet. - - Args: - voxel_features (torch.float32): Voxel features in shape [N, C]. - coors (torch.int32): Coordinates in shape [N, 4], - the columns in the order of (batch_idx, z_idx, y_idx, x_idx). - batch_size (int): Batch size. - - Returns: - dict[str, torch.Tensor]: Backbone features. 
- """ - coors = coors.int() - input_sp_tensor = SparseConvTensor(voxel_features, coors, - self.sparse_shape, batch_size) - x = self.conv_input(input_sp_tensor) - - encode_features = [] - for encoder_layer in self.encoder_layers: - x = encoder_layer(x) - encode_features.append(x) - - # for detection head - # [200, 176, 5] -> [200, 176, 2] - out = self.conv_out(encode_features[-1]) - spatial_features = out.dense() - - N, C, D, H, W = spatial_features.shape - spatial_features = spatial_features.view(N, C * D, H, W) - - # for segmentation head, with output shape: - # [400, 352, 11] <- [200, 176, 5] - # [800, 704, 21] <- [400, 352, 11] - # [1600, 1408, 41] <- [800, 704, 21] - # [1600, 1408, 41] <- [1600, 1408, 41] - decode_features = [] - x = encode_features[-1] - for i in range(self.stage_num, 0, -1): - x = self.decoder_layer_forward(encode_features[i - 1], x, - getattr(self, f'lateral_layer{i}'), - getattr(self, f'merge_layer{i}'), - getattr(self, f'upsample_layer{i}')) - decode_features.append(x) - - seg_features = decode_features[-1].features - - ret = dict( - spatial_features=spatial_features, seg_features=seg_features) - - return ret - - def decoder_layer_forward(self, x_lateral, x_bottom, lateral_layer, - merge_layer, upsample_layer): - """Forward of upsample and residual block. - - Args: - x_lateral (:obj:`SparseConvTensor`): Lateral tensor. - x_bottom (:obj:`SparseConvTensor`): Feature from bottom layer. - lateral_layer (SparseBasicBlock): Convolution for lateral tensor. - merge_layer (SparseSequential): Convolution for merging features. - upsample_layer (SparseSequential): Convolution for upsampling. - - Returns: - :obj:`SparseConvTensor`: Upsampled feature. - """ - x = lateral_layer(x_lateral) - x = replace_feature(x, torch.cat((x_bottom.features, x.features), - dim=1)) - x_merge = merge_layer(x) - x = self.reduce_channel(x, x_merge.features.shape[1]) - x = replace_feature(x, x_merge.features + x.features) - x = upsample_layer(x) - return x - - @staticmethod - def reduce_channel(x, out_channels): - """reduce channel for element-wise addition. - - Args: - x (:obj:`SparseConvTensor`): Sparse tensor, ``x.features`` - are in shape (N, C1). - out_channels (int): The number of channel after reduction. - - Returns: - :obj:`SparseConvTensor`: Channel reduced feature. - """ - features = x.features - n, in_channels = features.shape - assert (in_channels % out_channels - == 0) and (in_channels >= out_channels) - x = replace_feature(x, features.view(n, out_channels, -1).sum(dim=2)) - return x - - def make_encoder_layers(self, make_block, norm_cfg, in_channels): - """make encoder layers using sparse convs. - - Args: - make_block (method): A bounded function to build blocks. - norm_cfg (dict[str]): Config of normalization layer. - in_channels (int): The number of encoder input channels. - - Returns: - int: The number of encoder output channels. 
- """ - self.encoder_layers = SparseSequential() - - for i, blocks in enumerate(self.encoder_channels): - blocks_list = [] - for j, out_channels in enumerate(tuple(blocks)): - padding = tuple(self.encoder_paddings[i])[j] - # each stage started with a spconv layer - # except the first stage - if i != 0 and j == 0: - blocks_list.append( - make_block( - in_channels, - out_channels, - 3, - norm_cfg=norm_cfg, - stride=2, - padding=padding, - indice_key=f'spconv{i + 1}', - conv_type='SparseConv3d')) - else: - blocks_list.append( - make_block( - in_channels, - out_channels, - 3, - norm_cfg=norm_cfg, - padding=padding, - indice_key=f'subm{i + 1}', - conv_type='SubMConv3d')) - in_channels = out_channels - stage_name = f'encoder_layer{i + 1}' - stage_layers = SparseSequential(*blocks_list) - self.encoder_layers.add_module(stage_name, stage_layers) - return out_channels - - def make_decoder_layers(self, make_block, norm_cfg, in_channels): - """make decoder layers using sparse convs. - - Args: - make_block (method): A bounded function to build blocks. - norm_cfg (dict[str]): Config of normalization layer. - in_channels (int): The number of encoder input channels. - - Returns: - int: The number of encoder output channels. - """ - block_num = len(self.decoder_channels) - for i, block_channels in enumerate(self.decoder_channels): - paddings = self.decoder_paddings[i] - setattr( - self, f'lateral_layer{block_num - i}', - SparseBasicBlock( - in_channels, - block_channels[0], - conv_cfg=dict( - type='SubMConv3d', indice_key=f'subm{block_num - i}'), - norm_cfg=norm_cfg)) - setattr( - self, f'merge_layer{block_num - i}', - make_block( - in_channels * 2, - block_channels[1], - 3, - norm_cfg=norm_cfg, - padding=paddings[0], - indice_key=f'subm{block_num - i}', - conv_type='SubMConv3d')) - if block_num - i != 1: - setattr( - self, f'upsample_layer{block_num - i}', - make_block( - in_channels, - block_channels[2], - 3, - norm_cfg=norm_cfg, - indice_key=f'spconv{block_num - i}', - conv_type='SparseInverseConv3d')) - else: - # use submanifold conv instead of inverse conv - # in the last block - setattr( - self, f'upsample_layer{block_num - i}', - make_block( - in_channels, - block_channels[2], - 3, - norm_cfg=norm_cfg, - padding=paddings[1], - indice_key='subm1', - conv_type='SubMConv3d')) - in_channels = block_channels[2] +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE + +if IS_SPCONV2_AVAILABLE: + from spconv.pytorch import SparseConvTensor, SparseSequential +else: + from mmcv.ops import SparseConvTensor, SparseSequential + +from mmcv.runner import BaseModule, auto_fp16 + +from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule +from mmdet3d.ops.sparse_block import replace_feature +from ..builder import MIDDLE_ENCODERS + + +@MIDDLE_ENCODERS.register_module() +class SparseUNet(BaseModule): + r"""SparseUNet for PartA^2. + + See the `paper `_ for more details. + + Args: + in_channels (int): The number of input channels. + sparse_shape (list[int]): The sparse shape of input tensor. + norm_cfg (dict): Config of normalization layer. + base_channels (int): Out channels for conv_input layer. + output_channels (int): Out channels for conv_out layer. + encoder_channels (tuple[tuple[int]]): + Convolutional channels of each encode block. + encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. + decoder_channels (tuple[tuple[int]]): + Convolutional channels of each decode block. 
+ decoder_paddings (tuple[tuple[int]]): Paddings of each decode block. + """ + + def __init__(self, + in_channels, + sparse_shape, + order=('conv', 'norm', 'act'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + base_channels=16, + output_channels=128, + encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, + 64)), + encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, + 1)), + decoder_channels=((64, 64, 64), (64, 64, 32), (32, 32, 16), + (16, 16, 16)), + decoder_paddings=((1, 0), (1, 0), (0, 0), (0, 1)), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.sparse_shape = sparse_shape + self.in_channels = in_channels + self.order = order + self.base_channels = base_channels + self.output_channels = output_channels + self.encoder_channels = encoder_channels + self.encoder_paddings = encoder_paddings + self.decoder_channels = decoder_channels + self.decoder_paddings = decoder_paddings + self.stage_num = len(self.encoder_channels) + self.fp16_enabled = False + # Spconv init all weight on its own + + assert isinstance(order, tuple) and len(order) == 3 + assert set(order) == {'conv', 'norm', 'act'} + + if self.order[0] != 'conv': # pre activate + self.conv_input = make_sparse_convmodule( + in_channels, + self.base_channels, + 3, + norm_cfg=norm_cfg, + padding=1, + indice_key='subm1', + conv_type='SubMConv3d', + order=('conv', )) + else: # post activate + self.conv_input = make_sparse_convmodule( + in_channels, + self.base_channels, + 3, + norm_cfg=norm_cfg, + padding=1, + indice_key='subm1', + conv_type='SubMConv3d') + + encoder_out_channels = self.make_encoder_layers( + make_sparse_convmodule, norm_cfg, self.base_channels) + self.make_decoder_layers(make_sparse_convmodule, norm_cfg, + encoder_out_channels) + + self.conv_out = make_sparse_convmodule( + encoder_out_channels, + self.output_channels, + kernel_size=(3, 1, 1), + stride=(2, 1, 1), + norm_cfg=norm_cfg, + padding=0, + indice_key='spconv_down2', + conv_type='SparseConv3d') + + @auto_fp16(apply_to=('voxel_features', )) + def forward(self, voxel_features, coors, batch_size): + """Forward of SparseUNet. + + Args: + voxel_features (torch.float32): Voxel features in shape [N, C]. + coors (torch.int32): Coordinates in shape [N, 4], + the columns in the order of (batch_idx, z_idx, y_idx, x_idx). + batch_size (int): Batch size. + + Returns: + dict[str, torch.Tensor]: Backbone features. 
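For orientation, a hedged configuration sketch showing how a middle encoder like this is typically selected through the MIDDLE_ENCODERS registry; the numeric values are illustrative placeholders, not copied from a shipped config:

# illustrative PartA2-style values, placeholders only
middle_encoder = dict(
    type='SparseUNet',
    in_channels=4,                     # e.g. x, y, z, intensity
    sparse_shape=[41, 1600, 1408],     # (D, H, W) of the voxel grid, assumed
    order=('conv', 'norm', 'act'))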
+ """ + coors = coors.int() + input_sp_tensor = SparseConvTensor(voxel_features, coors, + self.sparse_shape, batch_size) + x = self.conv_input(input_sp_tensor) + + encode_features = [] + for encoder_layer in self.encoder_layers: + x = encoder_layer(x) + encode_features.append(x) + + # for detection head + # [200, 176, 5] -> [200, 176, 2] + out = self.conv_out(encode_features[-1]) + spatial_features = out.dense() + + N, C, D, H, W = spatial_features.shape + spatial_features = spatial_features.view(N, C * D, H, W) + + # for segmentation head, with output shape: + # [400, 352, 11] <- [200, 176, 5] + # [800, 704, 21] <- [400, 352, 11] + # [1600, 1408, 41] <- [800, 704, 21] + # [1600, 1408, 41] <- [1600, 1408, 41] + decode_features = [] + x = encode_features[-1] + for i in range(self.stage_num, 0, -1): + x = self.decoder_layer_forward(encode_features[i - 1], x, + getattr(self, f'lateral_layer{i}'), + getattr(self, f'merge_layer{i}'), + getattr(self, f'upsample_layer{i}')) + decode_features.append(x) + + seg_features = decode_features[-1].features + + ret = dict( + spatial_features=spatial_features, seg_features=seg_features) + + return ret + + def decoder_layer_forward(self, x_lateral, x_bottom, lateral_layer, + merge_layer, upsample_layer): + """Forward of upsample and residual block. + + Args: + x_lateral (:obj:`SparseConvTensor`): Lateral tensor. + x_bottom (:obj:`SparseConvTensor`): Feature from bottom layer. + lateral_layer (SparseBasicBlock): Convolution for lateral tensor. + merge_layer (SparseSequential): Convolution for merging features. + upsample_layer (SparseSequential): Convolution for upsampling. + + Returns: + :obj:`SparseConvTensor`: Upsampled feature. + """ + x = lateral_layer(x_lateral) + x = replace_feature(x, torch.cat((x_bottom.features, x.features), + dim=1)) + x_merge = merge_layer(x) + x = self.reduce_channel(x, x_merge.features.shape[1]) + x = replace_feature(x, x_merge.features + x.features) + x = upsample_layer(x) + return x + + @staticmethod + def reduce_channel(x, out_channels): + """reduce channel for element-wise addition. + + Args: + x (:obj:`SparseConvTensor`): Sparse tensor, ``x.features`` + are in shape (N, C1). + out_channels (int): The number of channel after reduction. + + Returns: + :obj:`SparseConvTensor`: Channel reduced feature. + """ + features = x.features + n, in_channels = features.shape + assert (in_channels % out_channels + == 0) and (in_channels >= out_channels) + x = replace_feature(x, features.view(n, out_channels, -1).sum(dim=2)) + return x + + def make_encoder_layers(self, make_block, norm_cfg, in_channels): + """make encoder layers using sparse convs. + + Args: + make_block (method): A bounded function to build blocks. + norm_cfg (dict[str]): Config of normalization layer. + in_channels (int): The number of encoder input channels. + + Returns: + int: The number of encoder output channels. 
+ """ + self.encoder_layers = SparseSequential() + + for i, blocks in enumerate(self.encoder_channels): + blocks_list = [] + for j, out_channels in enumerate(tuple(blocks)): + padding = tuple(self.encoder_paddings[i])[j] + # each stage started with a spconv layer + # except the first stage + if i != 0 and j == 0: + blocks_list.append( + make_block( + in_channels, + out_channels, + 3, + norm_cfg=norm_cfg, + stride=2, + padding=padding, + indice_key=f'spconv{i + 1}', + conv_type='SparseConv3d')) + else: + blocks_list.append( + make_block( + in_channels, + out_channels, + 3, + norm_cfg=norm_cfg, + padding=padding, + indice_key=f'subm{i + 1}', + conv_type='SubMConv3d')) + in_channels = out_channels + stage_name = f'encoder_layer{i + 1}' + stage_layers = SparseSequential(*blocks_list) + self.encoder_layers.add_module(stage_name, stage_layers) + return out_channels + + def make_decoder_layers(self, make_block, norm_cfg, in_channels): + """make decoder layers using sparse convs. + + Args: + make_block (method): A bounded function to build blocks. + norm_cfg (dict[str]): Config of normalization layer. + in_channels (int): The number of encoder input channels. + + Returns: + int: The number of encoder output channels. + """ + block_num = len(self.decoder_channels) + for i, block_channels in enumerate(self.decoder_channels): + paddings = self.decoder_paddings[i] + setattr( + self, f'lateral_layer{block_num - i}', + SparseBasicBlock( + in_channels, + block_channels[0], + conv_cfg=dict( + type='SubMConv3d', indice_key=f'subm{block_num - i}'), + norm_cfg=norm_cfg)) + setattr( + self, f'merge_layer{block_num - i}', + make_block( + in_channels * 2, + block_channels[1], + 3, + norm_cfg=norm_cfg, + padding=paddings[0], + indice_key=f'subm{block_num - i}', + conv_type='SubMConv3d')) + if block_num - i != 1: + setattr( + self, f'upsample_layer{block_num - i}', + make_block( + in_channels, + block_channels[2], + 3, + norm_cfg=norm_cfg, + indice_key=f'spconv{block_num - i}', + conv_type='SparseInverseConv3d')) + else: + # use submanifold conv instead of inverse conv + # in the last block + setattr( + self, f'upsample_layer{block_num - i}', + make_block( + in_channels, + block_channels[2], + 3, + norm_cfg=norm_cfg, + padding=paddings[1], + indice_key='subm1', + conv_type='SubMConv3d')) + in_channels = block_channels[2] diff --git a/mmdet3d/models/model_utils/__init__.py b/mmdet3d/models/model_utils/__init__.py index 34df79a..f86d069 100644 --- a/mmdet3d/models/model_utils/__init__.py +++ b/mmdet3d/models/model_utils/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .edge_fusion_module import EdgeFusionModule -from .transformer import GroupFree3DMHA -from .vote_module import VoteModule - -__all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule'] +# Copyright (c) OpenMMLab. All rights reserved. +from .edge_fusion_module import EdgeFusionModule +from .transformer import GroupFree3DMHA +from .vote_module import VoteModule + +__all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule'] diff --git a/mmdet3d/models/model_utils/edge_fusion_module.py b/mmdet3d/models/model_utils/edge_fusion_module.py index 2d9e09e..b51bd48 100644 --- a/mmdet3d/models/model_utils/edge_fusion_module.py +++ b/mmdet3d/models/model_utils/edge_fusion_module.py @@ -1,78 +1,78 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule -from torch import nn as nn -from torch.nn import functional as F - - -class EdgeFusionModule(BaseModule): - """Edge Fusion Module for feature map. - - Args: - out_channels (int): The number of output channels. - feat_channels (int): The number of channels in feature map - during edge feature fusion. - kernel_size (int, optional): Kernel size of convolution. - Default: 3. - act_cfg (dict, optional): Config of activation. - Default: dict(type='ReLU'). - norm_cfg (dict, optional): Config of normalization. - Default: dict(type='BN1d')). - """ - - def __init__(self, - out_channels, - feat_channels, - kernel_size=3, - act_cfg=dict(type='ReLU'), - norm_cfg=dict(type='BN1d')): - super().__init__() - self.edge_convs = nn.Sequential( - ConvModule( - feat_channels, - feat_channels, - kernel_size=kernel_size, - padding=kernel_size // 2, - conv_cfg=dict(type='Conv1d'), - norm_cfg=norm_cfg, - act_cfg=act_cfg), - nn.Conv1d(feat_channels, out_channels, kernel_size=1)) - self.feat_channels = feat_channels - - def forward(self, features, fused_features, edge_indices, edge_lens, - output_h, output_w): - """Forward pass. - - Args: - features (torch.Tensor): Different representative features - for fusion. - fused_features (torch.Tensor): Different representative - features to be fused. - edge_indices (torch.Tensor): Batch image edge indices. - edge_lens (list[int]): List of edge length of each image. - output_h (int): Height of output feature map. - output_w (int): Width of output feature map. - - Returns: - torch.Tensor: Fused feature maps. - """ - batch_size = features.shape[0] - # normalize - grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float() - grid_edge_indices[..., 0] = \ - grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1 - grid_edge_indices[..., 1] = \ - grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1 - - # apply edge fusion - edge_features = F.grid_sample( - features, grid_edge_indices, align_corners=True).squeeze(-1) - edge_output = self.edge_convs(edge_features) - - for k in range(batch_size): - edge_indice_k = edge_indices[k, :edge_lens[k]] - fused_features[k, :, edge_indice_k[:, 1], - edge_indice_k[:, 0]] += edge_output[ - k, :, :edge_lens[k]] - - return fused_features +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule +from torch import nn as nn +from torch.nn import functional as F + + +class EdgeFusionModule(BaseModule): + """Edge Fusion Module for feature map. + + Args: + out_channels (int): The number of output channels. + feat_channels (int): The number of channels in feature map + during edge feature fusion. + kernel_size (int, optional): Kernel size of convolution. + Default: 3. + act_cfg (dict, optional): Config of activation. + Default: dict(type='ReLU'). + norm_cfg (dict, optional): Config of normalization. + Default: dict(type='BN1d')). + """ + + def __init__(self, + out_channels, + feat_channels, + kernel_size=3, + act_cfg=dict(type='ReLU'), + norm_cfg=dict(type='BN1d')): + super().__init__() + self.edge_convs = nn.Sequential( + ConvModule( + feat_channels, + feat_channels, + kernel_size=kernel_size, + padding=kernel_size // 2, + conv_cfg=dict(type='Conv1d'), + norm_cfg=norm_cfg, + act_cfg=act_cfg), + nn.Conv1d(feat_channels, out_channels, kernel_size=1)) + self.feat_channels = feat_channels + + def forward(self, features, fused_features, edge_indices, edge_lens, + output_h, output_w): + """Forward pass. 
+ + Args: + features (torch.Tensor): Different representative features + for fusion. + fused_features (torch.Tensor): Different representative + features to be fused. + edge_indices (torch.Tensor): Batch image edge indices. + edge_lens (list[int]): List of edge length of each image. + output_h (int): Height of output feature map. + output_w (int): Width of output feature map. + + Returns: + torch.Tensor: Fused feature maps. + """ + batch_size = features.shape[0] + # normalize + grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float() + grid_edge_indices[..., 0] = \ + grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1 + grid_edge_indices[..., 1] = \ + grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1 + + # apply edge fusion + edge_features = F.grid_sample( + features, grid_edge_indices, align_corners=True).squeeze(-1) + edge_output = self.edge_convs(edge_features) + + for k in range(batch_size): + edge_indice_k = edge_indices[k, :edge_lens[k]] + fused_features[k, :, edge_indice_k[:, 1], + edge_indice_k[:, 0]] += edge_output[ + k, :, :edge_lens[k]] + + return fused_features diff --git a/mmdet3d/models/model_utils/transformer.py b/mmdet3d/models/model_utils/transformer.py index 4f9a833..a823873 100644 --- a/mmdet3d/models/model_utils/transformer.py +++ b/mmdet3d/models/model_utils/transformer.py @@ -1,139 +1,139 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn.bricks.registry import ATTENTION -from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING, MultiheadAttention -from torch import nn as nn - - -@ATTENTION.register_module() -class GroupFree3DMHA(MultiheadAttention): - """A warpper for torch.nn.MultiheadAttention for GroupFree3D. - - This module implements MultiheadAttention with identity connection, - and positional encoding used in DETR is also passed as input. - - Args: - embed_dims (int): The embedding dimension. - num_heads (int): Parallel attention heads. Same as - `nn.MultiheadAttention`. - attn_drop (float, optional): A Dropout layer on attn_output_weights. - Defaults to 0.0. - proj_drop (float, optional): A Dropout layer. Defaults to 0.0. - dropout_layer (obj:`ConfigDict`, optional): The dropout_layer used - when adding the shortcut. - init_cfg (obj:`mmcv.ConfigDict`, optional): The Config for - initialization. Default: None. - batch_first (bool, optional): Key, Query and Value are shape of - (batch, n, embed_dim) - or (n, batch, embed_dim). Defaults to False. - """ - - def __init__(self, - embed_dims, - num_heads, - attn_drop=0., - proj_drop=0., - dropout_layer=dict(type='DropOut', drop_prob=0.), - init_cfg=None, - batch_first=False, - **kwargs): - super().__init__(embed_dims, num_heads, attn_drop, proj_drop, - dropout_layer, init_cfg, batch_first, **kwargs) - - def forward(self, - query, - key, - value, - identity, - query_pos=None, - key_pos=None, - attn_mask=None, - key_padding_mask=None, - **kwargs): - """Forward function for `GroupFree3DMHA`. - - **kwargs allow passing a more general data flow when combining - with other operations in `transformerlayer`. - - Args: - query (Tensor): The input query with shape [num_queries, bs, - embed_dims]. Same in `nn.MultiheadAttention.forward`. - key (Tensor): The key tensor with shape [num_keys, bs, - embed_dims]. Same in `nn.MultiheadAttention.forward`. - If None, the ``query`` will be used. - value (Tensor): The value tensor with same shape as `key`. - Same in `nn.MultiheadAttention.forward`. - If None, the `key` will be used. 
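The edge-fusion forward shown above rescales integer edge pixel indices into the [-1, 1] range that F.grid_sample expects, then samples per-edge features from the map. A small sketch of just that normalization and sampling step, with a toy feature map and made-up edge indices:

import torch
import torch.nn.functional as F

output_h, output_w = 4, 6
features = torch.randn(1, 8, output_h, output_w)          # (B, C, H, W)
edge_indices = torch.tensor([[[0, 0], [5, 3]]], dtype=torch.float32)  # (B, K, 2) as (x, y)

grid = edge_indices.view(1, -1, 1, 2).clone()
grid[..., 0] = grid[..., 0] / (output_w - 1) * 2 - 1      # x -> [-1, 1]
grid[..., 1] = grid[..., 1] / (output_h - 1) * 2 - 1      # y -> [-1, 1]

edge_feats = F.grid_sample(features, grid, align_corners=True).squeeze(-1)
# with align_corners=True the corner indices map exactly onto pixels
assert torch.allclose(edge_feats[0, :, 0], features[0, :, 0, 0])
assert torch.allclose(edge_feats[0, :, 1], features[0, :, 3, 5])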
- identity (Tensor): This tensor, with the same shape as x, - will be used for the identity link. If None, `x` will be used. - query_pos (Tensor, optional): The positional encoding for query, - with the same shape as `x`. Defaults to None. - If not None, it will be added to `x` before forward function. - key_pos (Tensor, optional): The positional encoding for `key`, - with the same shape as `key`. Defaults to None. If not None, - it will be added to `key` before forward function. If None, - and `query_pos` has the same shape as `key`, then `query_pos` - will be used for `key_pos`. Defaults to None. - attn_mask (Tensor, optional): ByteTensor mask with shape - [num_queries, num_keys]. - Same in `nn.MultiheadAttention.forward`. Defaults to None. - key_padding_mask (Tensor, optional): ByteTensor with shape - [bs, num_keys]. Same in `nn.MultiheadAttention.forward`. - Defaults to None. - - Returns: - Tensor: forwarded results with shape [num_queries, bs, embed_dims]. - """ - - if hasattr(self, 'operation_name'): - if self.operation_name == 'self_attn': - value = value + query_pos - elif self.operation_name == 'cross_attn': - value = value + key_pos - else: - raise NotImplementedError( - f'{self.__class__.name} ' - f"can't be used as {self.operation_name}") - else: - value = value + query_pos - - return super(GroupFree3DMHA, self).forward( - query=query, - key=key, - value=value, - identity=identity, - query_pos=query_pos, - key_pos=key_pos, - attn_mask=attn_mask, - key_padding_mask=key_padding_mask, - **kwargs) - - -@POSITIONAL_ENCODING.register_module() -class ConvBNPositionalEncoding(nn.Module): - """Absolute position embedding with Conv learning. - - Args: - input_channel (int): input features dim. - num_pos_feats (int, optional): output position features dim. - Defaults to 288 to be consistent with seed features dim. - """ - - def __init__(self, input_channel, num_pos_feats=288): - super().__init__() - self.position_embedding_head = nn.Sequential( - nn.Conv1d(input_channel, num_pos_feats, kernel_size=1), - nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True), - nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1)) - - def forward(self, xyz): - """Forward pass. - - Args: - xyz (Tensor): (B, N, 3) the coordinates to embed. - - Returns: - Tensor: (B, num_pos_feats, N) the embedded position features. - """ - xyz = xyz.permute(0, 2, 1) - position_embedding = self.position_embedding_head(xyz) - return position_embedding +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn.bricks.registry import ATTENTION +from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING, MultiheadAttention +from torch import nn as nn + + +@ATTENTION.register_module() +class GroupFree3DMHA(MultiheadAttention): + """A warpper for torch.nn.MultiheadAttention for GroupFree3D. + + This module implements MultiheadAttention with identity connection, + and positional encoding used in DETR is also passed as input. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. Same as + `nn.MultiheadAttention`. + attn_drop (float, optional): A Dropout layer on attn_output_weights. + Defaults to 0.0. + proj_drop (float, optional): A Dropout layer. Defaults to 0.0. + dropout_layer (obj:`ConfigDict`, optional): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`, optional): The Config for + initialization. Default: None. + batch_first (bool, optional): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). 
Defaults to False. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='DropOut', drop_prob=0.), + init_cfg=None, + batch_first=False, + **kwargs): + super().__init__(embed_dims, num_heads, attn_drop, proj_drop, + dropout_layer, init_cfg, batch_first, **kwargs) + + def forward(self, + query, + key, + value, + identity, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `GroupFree3DMHA`. + + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims]. Same in `nn.MultiheadAttention.forward`. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims]. Same in `nn.MultiheadAttention.forward`. + If None, the ``query`` will be used. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. If None, `x` will be used. + query_pos (Tensor, optional): The positional encoding for query, + with the same shape as `x`. Defaults to None. + If not None, it will be added to `x` before forward function. + key_pos (Tensor, optional): The positional encoding for `key`, + with the same shape as `key`. Defaults to None. If not None, + it will be added to `key` before forward function. If None, + and `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor, optional): ByteTensor mask with shape + [num_queries, num_keys]. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + key_padding_mask (Tensor, optional): ByteTensor with shape + [bs, num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + if hasattr(self, 'operation_name'): + if self.operation_name == 'self_attn': + value = value + query_pos + elif self.operation_name == 'cross_attn': + value = value + key_pos + else: + raise NotImplementedError( + f'{self.__class__.name} ' + f"can't be used as {self.operation_name}") + else: + value = value + query_pos + + return super(GroupFree3DMHA, self).forward( + query=query, + key=key, + value=value, + identity=identity, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask, + **kwargs) + + +@POSITIONAL_ENCODING.register_module() +class ConvBNPositionalEncoding(nn.Module): + """Absolute position embedding with Conv learning. + + Args: + input_channel (int): input features dim. + num_pos_feats (int, optional): output position features dim. + Defaults to 288 to be consistent with seed features dim. + """ + + def __init__(self, input_channel, num_pos_feats=288): + super().__init__() + self.position_embedding_head = nn.Sequential( + nn.Conv1d(input_channel, num_pos_feats, kernel_size=1), + nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True), + nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1)) + + def forward(self, xyz): + """Forward pass. + + Args: + xyz (Tensor): (B, N, 3) the coordinates to embed. + + Returns: + Tensor: (B, num_pos_feats, N) the embedded position features. 
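A quick shape check for ConvBNPositionalEncoding; this assumes the class defined above is in scope and only illustrates the expected tensor layout:

import torch

pos_embed = ConvBNPositionalEncoding(input_channel=3, num_pos_feats=288)
xyz = torch.rand(2, 1024, 3)      # (B, N, 3) seed coordinates
feats = pos_embed(xyz)            # permuted to (B, 3, N) internally
assert feats.shape == (2, 288, 1024)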
+ """ + xyz = xyz.permute(0, 2, 1) + position_embedding = self.position_embedding_head(xyz) + return position_embedding diff --git a/mmdet3d/models/model_utils/vote_module.py b/mmdet3d/models/model_utils/vote_module.py index 5cc52ad..fc5b881 100644 --- a/mmdet3d/models/model_utils/vote_module.py +++ b/mmdet3d/models/model_utils/vote_module.py @@ -1,184 +1,184 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv import is_tuple_of -from mmcv.cnn import ConvModule -from torch import nn as nn - -from mmdet3d.models.builder import build_loss - - -class VoteModule(nn.Module): - """Vote module. - - Generate votes from seed point features. - - Args: - in_channels (int): Number of channels of seed point features. - vote_per_seed (int, optional): Number of votes generated from - each seed point. Default: 1. - gt_per_seed (int, optional): Number of ground truth votes generated - from each seed point. Default: 3. - num_points (int, optional): Number of points to be used for voting. - Default: 1. - conv_channels (tuple[int], optional): Out channels of vote - generating convolution. Default: (16, 16). - conv_cfg (dict, optional): Config of convolution. - Default: dict(type='Conv1d'). - norm_cfg (dict, optional): Config of normalization. - Default: dict(type='BN1d'). - norm_feats (bool, optional): Whether to normalize features. - Default: True. - with_res_feat (bool, optional): Whether to predict residual features. - Default: True. - vote_xyz_range (list[float], optional): - The range of points translation. Default: None. - vote_loss (dict, optional): Config of vote loss. Default: None. - """ - - def __init__(self, - in_channels, - vote_per_seed=1, - gt_per_seed=3, - num_points=-1, - conv_channels=(16, 16), - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - norm_feats=True, - with_res_feat=True, - vote_xyz_range=None, - vote_loss=None): - super().__init__() - self.in_channels = in_channels - self.vote_per_seed = vote_per_seed - self.gt_per_seed = gt_per_seed - self.num_points = num_points - self.norm_feats = norm_feats - self.with_res_feat = with_res_feat - - assert vote_xyz_range is None or is_tuple_of(vote_xyz_range, float) - self.vote_xyz_range = vote_xyz_range - - if vote_loss is not None: - self.vote_loss = build_loss(vote_loss) - - prev_channels = in_channels - vote_conv_list = list() - for k in range(len(conv_channels)): - vote_conv_list.append( - ConvModule( - prev_channels, - conv_channels[k], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - bias=True, - inplace=True)) - prev_channels = conv_channels[k] - self.vote_conv = nn.Sequential(*vote_conv_list) - - # conv_out predicts coordinate and residual features - if with_res_feat: - out_channel = (3 + in_channels) * self.vote_per_seed - else: - out_channel = 3 * self.vote_per_seed - self.conv_out = nn.Conv1d(prev_channels, out_channel, 1) - - def forward(self, seed_points, seed_feats): - """forward. - - Args: - seed_points (torch.Tensor): Coordinate of the seed - points in shape (B, N, 3). - seed_feats (torch.Tensor): Features of the seed points in shape - (B, C, N). - - Returns: - tuple[torch.Tensor]: - - - vote_points: Voted xyz based on the seed points - with shape (B, M, 3), ``M=num_seed*vote_per_seed``. - - vote_features: Voted features based on the seed points with - shape (B, C, M) where ``M=num_seed*vote_per_seed``, - ``C=vote_feature_dim``. 
- """ - if self.num_points != -1: - assert self.num_points < seed_points.shape[1], \ - f'Number of vote points ({self.num_points}) should be '\ - f'smaller than seed points size ({seed_points.shape[1]})' - seed_points = seed_points[:, :self.num_points] - seed_feats = seed_feats[..., :self.num_points] - - batch_size, feat_channels, num_seed = seed_feats.shape - num_vote = num_seed * self.vote_per_seed - x = self.vote_conv(seed_feats) - # (batch_size, (3+out_dim)*vote_per_seed, num_seed) - votes = self.conv_out(x) - - votes = votes.transpose(2, 1).view(batch_size, num_seed, - self.vote_per_seed, -1) - - offset = votes[:, :, :, 0:3] - if self.vote_xyz_range is not None: - limited_offset_list = [] - for axis in range(len(self.vote_xyz_range)): - limited_offset_list.append(offset[..., axis].clamp( - min=-self.vote_xyz_range[axis], - max=self.vote_xyz_range[axis])) - limited_offset = torch.stack(limited_offset_list, -1) - vote_points = (seed_points.unsqueeze(2) + - limited_offset).contiguous() - else: - vote_points = (seed_points.unsqueeze(2) + offset).contiguous() - vote_points = vote_points.view(batch_size, num_vote, 3) - offset = offset.reshape(batch_size, num_vote, 3).transpose(2, 1) - - if self.with_res_feat: - res_feats = votes[:, :, :, 3:] - vote_feats = (seed_feats.transpose(2, 1).unsqueeze(2) + - res_feats).contiguous() - vote_feats = vote_feats.view(batch_size, - num_vote, feat_channels).transpose( - 2, 1).contiguous() - - if self.norm_feats: - features_norm = torch.norm(vote_feats, p=2, dim=1) - vote_feats = vote_feats.div(features_norm.unsqueeze(1)) - else: - vote_feats = seed_feats - return vote_points, vote_feats, offset - - def get_loss(self, seed_points, vote_points, seed_indices, - vote_targets_mask, vote_targets): - """Calculate loss of voting module. - - Args: - seed_points (torch.Tensor): Coordinate of the seed points. - vote_points (torch.Tensor): Coordinate of the vote points. - seed_indices (torch.Tensor): Indices of seed points in raw points. - vote_targets_mask (torch.Tensor): Mask of valid vote targets. - vote_targets (torch.Tensor): Targets of votes. - - Returns: - torch.Tensor: Weighted vote loss. - """ - batch_size, num_seed = seed_points.shape[:2] - - seed_gt_votes_mask = torch.gather(vote_targets_mask, 1, - seed_indices).float() - - seed_indices_expand = seed_indices.unsqueeze(-1).repeat( - 1, 1, 3 * self.gt_per_seed) - seed_gt_votes = torch.gather(vote_targets, 1, seed_indices_expand) - seed_gt_votes += seed_points.repeat(1, 1, self.gt_per_seed) - - weight = seed_gt_votes_mask / (torch.sum(seed_gt_votes_mask) + 1e-6) - distance = self.vote_loss( - vote_points.view(batch_size * num_seed, -1, 3), - seed_gt_votes.view(batch_size * num_seed, -1, 3), - dst_weight=weight.view(batch_size * num_seed, 1))[1] - vote_loss = torch.sum(torch.min(distance, dim=1)[0]) - - return vote_loss +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv import is_tuple_of +from mmcv.cnn import ConvModule +from torch import nn as nn + +from mmdet3d.models.builder import build_loss + + +class VoteModule(nn.Module): + """Vote module. + + Generate votes from seed point features. + + Args: + in_channels (int): Number of channels of seed point features. + vote_per_seed (int, optional): Number of votes generated from + each seed point. Default: 1. + gt_per_seed (int, optional): Number of ground truth votes generated + from each seed point. Default: 3. + num_points (int, optional): Number of points to be used for voting. + Default: 1. 
+ conv_channels (tuple[int], optional): Out channels of vote + generating convolution. Default: (16, 16). + conv_cfg (dict, optional): Config of convolution. + Default: dict(type='Conv1d'). + norm_cfg (dict, optional): Config of normalization. + Default: dict(type='BN1d'). + norm_feats (bool, optional): Whether to normalize features. + Default: True. + with_res_feat (bool, optional): Whether to predict residual features. + Default: True. + vote_xyz_range (list[float], optional): + The range of points translation. Default: None. + vote_loss (dict, optional): Config of vote loss. Default: None. + """ + + def __init__(self, + in_channels, + vote_per_seed=1, + gt_per_seed=3, + num_points=-1, + conv_channels=(16, 16), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + norm_feats=True, + with_res_feat=True, + vote_xyz_range=None, + vote_loss=None): + super().__init__() + self.in_channels = in_channels + self.vote_per_seed = vote_per_seed + self.gt_per_seed = gt_per_seed + self.num_points = num_points + self.norm_feats = norm_feats + self.with_res_feat = with_res_feat + + assert vote_xyz_range is None or is_tuple_of(vote_xyz_range, float) + self.vote_xyz_range = vote_xyz_range + + if vote_loss is not None: + self.vote_loss = build_loss(vote_loss) + + prev_channels = in_channels + vote_conv_list = list() + for k in range(len(conv_channels)): + vote_conv_list.append( + ConvModule( + prev_channels, + conv_channels[k], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=True, + inplace=True)) + prev_channels = conv_channels[k] + self.vote_conv = nn.Sequential(*vote_conv_list) + + # conv_out predicts coordinate and residual features + if with_res_feat: + out_channel = (3 + in_channels) * self.vote_per_seed + else: + out_channel = 3 * self.vote_per_seed + self.conv_out = nn.Conv1d(prev_channels, out_channel, 1) + + def forward(self, seed_points, seed_feats): + """forward. + + Args: + seed_points (torch.Tensor): Coordinate of the seed + points in shape (B, N, 3). + seed_feats (torch.Tensor): Features of the seed points in shape + (B, C, N). + + Returns: + tuple[torch.Tensor]: + + - vote_points: Voted xyz based on the seed points + with shape (B, M, 3), ``M=num_seed*vote_per_seed``. + - vote_features: Voted features based on the seed points with + shape (B, C, M) where ``M=num_seed*vote_per_seed``, + ``C=vote_feature_dim``. 
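The Returns block above pins down the vote shapes: with num_seed seeds and vote_per_seed votes per seed, M = num_seed * vote_per_seed. A toy walk-through of the reshapes performed in forward (illustrative sizes, not a real head output):

import torch

batch_size, feat_channels, num_seed, vote_per_seed = 2, 16, 128, 3
num_vote = num_seed * vote_per_seed                    # M = 384

# conv_out emits (3 + C) values per vote: xyz offsets plus residual features
votes = torch.randn(batch_size, (3 + feat_channels) * vote_per_seed, num_seed)
votes = votes.transpose(2, 1).view(batch_size, num_seed, vote_per_seed, -1)

seed_points = torch.rand(batch_size, num_seed, 3)
vote_points = (seed_points.unsqueeze(2) + votes[..., 0:3]).reshape(
    batch_size, num_vote, 3)
assert vote_points.shape == (2, 384, 3)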
+ """ + if self.num_points != -1: + assert self.num_points < seed_points.shape[1], \ + f'Number of vote points ({self.num_points}) should be '\ + f'smaller than seed points size ({seed_points.shape[1]})' + seed_points = seed_points[:, :self.num_points] + seed_feats = seed_feats[..., :self.num_points] + + batch_size, feat_channels, num_seed = seed_feats.shape + num_vote = num_seed * self.vote_per_seed + x = self.vote_conv(seed_feats) + # (batch_size, (3+out_dim)*vote_per_seed, num_seed) + votes = self.conv_out(x) + + votes = votes.transpose(2, 1).view(batch_size, num_seed, + self.vote_per_seed, -1) + + offset = votes[:, :, :, 0:3] + if self.vote_xyz_range is not None: + limited_offset_list = [] + for axis in range(len(self.vote_xyz_range)): + limited_offset_list.append(offset[..., axis].clamp( + min=-self.vote_xyz_range[axis], + max=self.vote_xyz_range[axis])) + limited_offset = torch.stack(limited_offset_list, -1) + vote_points = (seed_points.unsqueeze(2) + + limited_offset).contiguous() + else: + vote_points = (seed_points.unsqueeze(2) + offset).contiguous() + vote_points = vote_points.view(batch_size, num_vote, 3) + offset = offset.reshape(batch_size, num_vote, 3).transpose(2, 1) + + if self.with_res_feat: + res_feats = votes[:, :, :, 3:] + vote_feats = (seed_feats.transpose(2, 1).unsqueeze(2) + + res_feats).contiguous() + vote_feats = vote_feats.view(batch_size, + num_vote, feat_channels).transpose( + 2, 1).contiguous() + + if self.norm_feats: + features_norm = torch.norm(vote_feats, p=2, dim=1) + vote_feats = vote_feats.div(features_norm.unsqueeze(1)) + else: + vote_feats = seed_feats + return vote_points, vote_feats, offset + + def get_loss(self, seed_points, vote_points, seed_indices, + vote_targets_mask, vote_targets): + """Calculate loss of voting module. + + Args: + seed_points (torch.Tensor): Coordinate of the seed points. + vote_points (torch.Tensor): Coordinate of the vote points. + seed_indices (torch.Tensor): Indices of seed points in raw points. + vote_targets_mask (torch.Tensor): Mask of valid vote targets. + vote_targets (torch.Tensor): Targets of votes. + + Returns: + torch.Tensor: Weighted vote loss. + """ + batch_size, num_seed = seed_points.shape[:2] + + seed_gt_votes_mask = torch.gather(vote_targets_mask, 1, + seed_indices).float() + + seed_indices_expand = seed_indices.unsqueeze(-1).repeat( + 1, 1, 3 * self.gt_per_seed) + seed_gt_votes = torch.gather(vote_targets, 1, seed_indices_expand) + seed_gt_votes += seed_points.repeat(1, 1, self.gt_per_seed) + + weight = seed_gt_votes_mask / (torch.sum(seed_gt_votes_mask) + 1e-6) + distance = self.vote_loss( + vote_points.view(batch_size * num_seed, -1, 3), + seed_gt_votes.view(batch_size * num_seed, -1, 3), + dst_weight=weight.view(batch_size * num_seed, 1))[1] + vote_loss = torch.sum(torch.min(distance, dim=1)[0]) + + return vote_loss diff --git a/mmdet3d/models/necks/__init__.py b/mmdet3d/models/necks/__init__.py index 5443d35..4c8c674 100644 --- a/mmdet3d/models/necks/__init__.py +++ b/mmdet3d/models/necks/__init__.py @@ -1,10 +1,10 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.models.necks.fpn import FPN -from .dla_neck import DLANeck -from .imvoxel_neck import OutdoorImVoxelNeck -from .pointnet2_fp_neck import PointNetFPNeck -from .second_fpn import SECONDFPN - -__all__ = [ - 'FPN', 'SECONDFPN', 'OutdoorImVoxelNeck', 'PointNetFPNeck', 'DLANeck' -] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmdet.models.necks.fpn import FPN +from .dla_neck import DLANeck +from .imvoxel_neck import OutdoorImVoxelNeck +from .pointnet2_fp_neck import PointNetFPNeck +from .second_fpn import SECONDFPN + +__all__ = [ + 'FPN', 'SECONDFPN', 'OutdoorImVoxelNeck', 'PointNetFPNeck', 'DLANeck' +] diff --git a/mmdet3d/models/necks/dla_neck.py b/mmdet3d/models/necks/dla_neck.py index c32e8bb..503fcba 100644 --- a/mmdet3d/models/necks/dla_neck.py +++ b/mmdet3d/models/necks/dla_neck.py @@ -1,233 +1,233 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import math - -import numpy as np -from mmcv.cnn import ConvModule, build_conv_layer -from mmcv.runner import BaseModule -from torch import nn as nn - -from ..builder import NECKS - - -def fill_up_weights(up): - """Simulated bilinear upsampling kernel. - - Args: - up (nn.Module): ConvTranspose2d module. - """ - w = up.weight.data - f = math.ceil(w.size(2) / 2) - c = (2 * f - 1 - f % 2) / (2. * f) - for i in range(w.size(2)): - for j in range(w.size(3)): - w[0, 0, i, j] = \ - (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) - for c in range(1, w.size(0)): - w[c, 0, :, :] = w[0, 0, :, :] - - -class IDAUpsample(BaseModule): - """Iterative Deep Aggregation (IDA) Upsampling module to upsample features - of different scales to a similar scale. - - Args: - out_channels (int): Number of output channels for DeformConv. - in_channels (List[int]): List of input channels of multi-scale - feature maps. - kernel_sizes (List[int]): List of size of the convolving - kernel of different scales. - norm_cfg (dict, optional): Config dict for normalization layer. - Default: None. - use_dcn (bool, optional): If True, use DCNv2. Default: True. - """ - - def __init__( - self, - out_channels, - in_channels, - kernel_sizes, - norm_cfg=None, - use_dcn=True, - init_cfg=None, - ): - super(IDAUpsample, self).__init__(init_cfg) - self.use_dcn = use_dcn - self.projs = nn.ModuleList() - self.ups = nn.ModuleList() - self.nodes = nn.ModuleList() - - for i in range(1, len(in_channels)): - in_channel = in_channels[i] - up_kernel_size = int(kernel_sizes[i]) - proj = ConvModule( - in_channel, - out_channels, - 3, - padding=1, - bias=True, - conv_cfg=dict(type='DCNv2') if self.use_dcn else None, - norm_cfg=norm_cfg) - node = ConvModule( - out_channels, - out_channels, - 3, - padding=1, - bias=True, - conv_cfg=dict(type='DCNv2') if self.use_dcn else None, - norm_cfg=norm_cfg) - up = build_conv_layer( - dict(type='deconv'), - out_channels, - out_channels, - up_kernel_size * 2, - stride=up_kernel_size, - padding=up_kernel_size // 2, - output_padding=0, - groups=out_channels, - bias=False) - - self.projs.append(proj) - self.ups.append(up) - self.nodes.append(node) - - def forward(self, mlvl_features, start_level, end_level): - """Forward function. - - Args: - mlvl_features (list[torch.Tensor]): Features from multiple layers. - start_level (int): Start layer for feature upsampling. - end_level (int): End layer for feature upsampling. - """ - for i in range(start_level, end_level - 1): - upsample = self.ups[i - start_level] - project = self.projs[i - start_level] - mlvl_features[i + 1] = upsample(project(mlvl_features[i + 1])) - node = self.nodes[i - start_level] - mlvl_features[i + 1] = node(mlvl_features[i + 1] + - mlvl_features[i]) - - -class DLAUpsample(BaseModule): - """Deep Layer Aggregation (DLA) Upsampling module for different scales - feature extraction, upsampling and fusion, It consists of groups of - IDAupsample modules. - - Args: - start_level (int): The start layer. 
- channels (List[int]): List of input channels of multi-scale - feature maps. - scales(List[int]): List of scale of different layers' feature. - in_channels (NoneType, optional): List of input channels of - different scales. Default: None. - norm_cfg (dict, optional): Config dict for normalization layer. - Default: None. - use_dcn (bool, optional): Whether to use dcn in IDAup module. - Default: True. - """ - - def __init__(self, - start_level, - channels, - scales, - in_channels=None, - norm_cfg=None, - use_dcn=True, - init_cfg=None): - super(DLAUpsample, self).__init__(init_cfg) - self.start_level = start_level - if in_channels is None: - in_channels = channels - self.channels = channels - channels = list(channels) - scales = np.array(scales, dtype=int) - for i in range(len(channels) - 1): - j = -i - 2 - setattr( - self, 'ida_{}'.format(i), - IDAUpsample(channels[j], in_channels[j:], - scales[j:] // scales[j], norm_cfg, use_dcn)) - scales[j + 1:] = scales[j] - in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] - - def forward(self, mlvl_features): - """Forward function. - - Args: - mlvl_features(list[torch.Tensor]): Features from multi-scale - layers. - - Returns: - tuple[torch.Tensor]: Up-sampled features of different layers. - """ - outs = [mlvl_features[-1]] - for i in range(len(mlvl_features) - self.start_level - 1): - ida = getattr(self, 'ida_{}'.format(i)) - ida(mlvl_features, len(mlvl_features) - i - 2, len(mlvl_features)) - outs.insert(0, mlvl_features[-1]) - return outs - - -@NECKS.register_module() -class DLANeck(BaseModule): - """DLA Neck. - - Args: - in_channels (list[int], optional): List of input channels - of multi-scale feature map. - start_level (int, optional): The scale level where upsampling - starts. Default: 2. - end_level (int, optional): The scale level where upsampling - ends. Default: 5. - norm_cfg (dict, optional): Config dict for normalization - layer. Default: None. - use_dcn (bool, optional): Whether to use dcn in IDAup module. - Default: True. 
- """ - - def __init__(self, - in_channels=[16, 32, 64, 128, 256, 512], - start_level=2, - end_level=5, - norm_cfg=None, - use_dcn=True, - init_cfg=None): - super(DLANeck, self).__init__(init_cfg) - self.start_level = start_level - self.end_level = end_level - scales = [2**i for i in range(len(in_channels[self.start_level:]))] - self.dla_up = DLAUpsample( - start_level=self.start_level, - channels=in_channels[self.start_level:], - scales=scales, - norm_cfg=norm_cfg, - use_dcn=use_dcn) - self.ida_up = IDAUpsample( - in_channels[self.start_level], - in_channels[self.start_level:self.end_level], - [2**i for i in range(self.end_level - self.start_level)], norm_cfg, - use_dcn) - - def forward(self, x): - mlvl_features = [x[i] for i in range(len(x))] - mlvl_features = self.dla_up(mlvl_features) - outs = [] - for i in range(self.end_level - self.start_level): - outs.append(mlvl_features[i].clone()) - self.ida_up(outs, 0, len(outs)) - return [outs[-1]] - - def init_weights(self): - for m in self.modules(): - if isinstance(m, nn.ConvTranspose2d): - # In order to be consistent with the source code, - # reset the ConvTranspose2d initialization parameters - m.reset_parameters() - # Simulated bilinear upsampling kernel - fill_up_weights(m) - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.Conv2d): - # In order to be consistent with the source code, - # reset the Conv2d initialization parameters - m.reset_parameters() +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +from mmcv.cnn import ConvModule, build_conv_layer +from mmcv.runner import BaseModule +from torch import nn as nn + +from ..builder import NECKS + + +def fill_up_weights(up): + """Simulated bilinear upsampling kernel. + + Args: + up (nn.Module): ConvTranspose2d module. + """ + w = up.weight.data + f = math.ceil(w.size(2) / 2) + c = (2 * f - 1 - f % 2) / (2. * f) + for i in range(w.size(2)): + for j in range(w.size(3)): + w[0, 0, i, j] = \ + (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) + for c in range(1, w.size(0)): + w[c, 0, :, :] = w[0, 0, :, :] + + +class IDAUpsample(BaseModule): + """Iterative Deep Aggregation (IDA) Upsampling module to upsample features + of different scales to a similar scale. + + Args: + out_channels (int): Number of output channels for DeformConv. + in_channels (List[int]): List of input channels of multi-scale + feature maps. + kernel_sizes (List[int]): List of size of the convolving + kernel of different scales. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + use_dcn (bool, optional): If True, use DCNv2. Default: True. 
+ """ + + def __init__( + self, + out_channels, + in_channels, + kernel_sizes, + norm_cfg=None, + use_dcn=True, + init_cfg=None, + ): + super(IDAUpsample, self).__init__(init_cfg) + self.use_dcn = use_dcn + self.projs = nn.ModuleList() + self.ups = nn.ModuleList() + self.nodes = nn.ModuleList() + + for i in range(1, len(in_channels)): + in_channel = in_channels[i] + up_kernel_size = int(kernel_sizes[i]) + proj = ConvModule( + in_channel, + out_channels, + 3, + padding=1, + bias=True, + conv_cfg=dict(type='DCNv2') if self.use_dcn else None, + norm_cfg=norm_cfg) + node = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + bias=True, + conv_cfg=dict(type='DCNv2') if self.use_dcn else None, + norm_cfg=norm_cfg) + up = build_conv_layer( + dict(type='deconv'), + out_channels, + out_channels, + up_kernel_size * 2, + stride=up_kernel_size, + padding=up_kernel_size // 2, + output_padding=0, + groups=out_channels, + bias=False) + + self.projs.append(proj) + self.ups.append(up) + self.nodes.append(node) + + def forward(self, mlvl_features, start_level, end_level): + """Forward function. + + Args: + mlvl_features (list[torch.Tensor]): Features from multiple layers. + start_level (int): Start layer for feature upsampling. + end_level (int): End layer for feature upsampling. + """ + for i in range(start_level, end_level - 1): + upsample = self.ups[i - start_level] + project = self.projs[i - start_level] + mlvl_features[i + 1] = upsample(project(mlvl_features[i + 1])) + node = self.nodes[i - start_level] + mlvl_features[i + 1] = node(mlvl_features[i + 1] + + mlvl_features[i]) + + +class DLAUpsample(BaseModule): + """Deep Layer Aggregation (DLA) Upsampling module for different scales + feature extraction, upsampling and fusion, It consists of groups of + IDAupsample modules. + + Args: + start_level (int): The start layer. + channels (List[int]): List of input channels of multi-scale + feature maps. + scales(List[int]): List of scale of different layers' feature. + in_channels (NoneType, optional): List of input channels of + different scales. Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + use_dcn (bool, optional): Whether to use dcn in IDAup module. + Default: True. + """ + + def __init__(self, + start_level, + channels, + scales, + in_channels=None, + norm_cfg=None, + use_dcn=True, + init_cfg=None): + super(DLAUpsample, self).__init__(init_cfg) + self.start_level = start_level + if in_channels is None: + in_channels = channels + self.channels = channels + channels = list(channels) + scales = np.array(scales, dtype=int) + for i in range(len(channels) - 1): + j = -i - 2 + setattr( + self, 'ida_{}'.format(i), + IDAUpsample(channels[j], in_channels[j:], + scales[j:] // scales[j], norm_cfg, use_dcn)) + scales[j + 1:] = scales[j] + in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] + + def forward(self, mlvl_features): + """Forward function. + + Args: + mlvl_features(list[torch.Tensor]): Features from multi-scale + layers. + + Returns: + tuple[torch.Tensor]: Up-sampled features of different layers. + """ + outs = [mlvl_features[-1]] + for i in range(len(mlvl_features) - self.start_level - 1): + ida = getattr(self, 'ida_{}'.format(i)) + ida(mlvl_features, len(mlvl_features) - i - 2, len(mlvl_features)) + outs.insert(0, mlvl_features[-1]) + return outs + + +@NECKS.register_module() +class DLANeck(BaseModule): + """DLA Neck. + + Args: + in_channels (list[int], optional): List of input channels + of multi-scale feature map. 
+ start_level (int, optional): The scale level where upsampling + starts. Default: 2. + end_level (int, optional): The scale level where upsampling + ends. Default: 5. + norm_cfg (dict, optional): Config dict for normalization + layer. Default: None. + use_dcn (bool, optional): Whether to use dcn in IDAup module. + Default: True. + """ + + def __init__(self, + in_channels=[16, 32, 64, 128, 256, 512], + start_level=2, + end_level=5, + norm_cfg=None, + use_dcn=True, + init_cfg=None): + super(DLANeck, self).__init__(init_cfg) + self.start_level = start_level + self.end_level = end_level + scales = [2**i for i in range(len(in_channels[self.start_level:]))] + self.dla_up = DLAUpsample( + start_level=self.start_level, + channels=in_channels[self.start_level:], + scales=scales, + norm_cfg=norm_cfg, + use_dcn=use_dcn) + self.ida_up = IDAUpsample( + in_channels[self.start_level], + in_channels[self.start_level:self.end_level], + [2**i for i in range(self.end_level - self.start_level)], norm_cfg, + use_dcn) + + def forward(self, x): + mlvl_features = [x[i] for i in range(len(x))] + mlvl_features = self.dla_up(mlvl_features) + outs = [] + for i in range(self.end_level - self.start_level): + outs.append(mlvl_features[i].clone()) + self.ida_up(outs, 0, len(outs)) + return [outs[-1]] + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.ConvTranspose2d): + # In order to be consistent with the source code, + # reset the ConvTranspose2d initialization parameters + m.reset_parameters() + # Simulated bilinear upsampling kernel + fill_up_weights(m) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + # In order to be consistent with the source code, + # reset the Conv2d initialization parameters + m.reset_parameters() diff --git a/mmdet3d/models/necks/imvoxel_neck.py b/mmdet3d/models/necks/imvoxel_neck.py index 8881491..0fb7d72 100644 --- a/mmdet3d/models/necks/imvoxel_neck.py +++ b/mmdet3d/models/necks/imvoxel_neck.py @@ -1,110 +1,110 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import ConvModule -from torch import nn - -from ..builder import NECKS - - -@NECKS.register_module() -class OutdoorImVoxelNeck(nn.Module): - """Neck for ImVoxelNet outdoor scenario. - - Args: - in_channels (int): Input channels of multi-scale feature map. - out_channels (int): Output channels of multi-scale feature map. - """ - - def __init__(self, in_channels, out_channels): - super().__init__() - self.model = nn.Sequential( - ResModule(in_channels), - ConvModule( - in_channels=in_channels, - out_channels=in_channels * 2, - kernel_size=3, - stride=(1, 1, 2), - padding=1, - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d'), - act_cfg=dict(type='ReLU', inplace=True)), - ResModule(in_channels * 2), - ConvModule( - in_channels=in_channels * 2, - out_channels=in_channels * 4, - kernel_size=3, - stride=(1, 1, 2), - padding=1, - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d'), - act_cfg=dict(type='ReLU', inplace=True)), - ResModule(in_channels * 4), - ConvModule( - in_channels=in_channels * 4, - out_channels=out_channels, - kernel_size=3, - padding=(1, 1, 0), - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d'), - act_cfg=dict(type='ReLU', inplace=True))) - - def forward(self, x): - """Forward function. - - Args: - x (torch.Tensor): of shape (N, C_in, N_x, N_y, N_z). - - Returns: - list[torch.Tensor]: of shape (N, C_out, N_y, N_x). 
- """ - x = self.model.forward(x) - assert x.shape[-1] == 1 - # Anchor3DHead axis order is (y, x). - return [x[..., 0].transpose(-1, -2)] - - def init_weights(self): - """Initialize weights of neck.""" - pass - - -class ResModule(nn.Module): - """3d residual block for ImVoxelNeck. - - Args: - n_channels (int): Input channels of a feature map. - """ - - def __init__(self, n_channels): - super().__init__() - self.conv0 = ConvModule( - in_channels=n_channels, - out_channels=n_channels, - kernel_size=3, - padding=1, - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d'), - act_cfg=dict(type='ReLU', inplace=True)) - self.conv1 = ConvModule( - in_channels=n_channels, - out_channels=n_channels, - kernel_size=3, - padding=1, - conv_cfg=dict(type='Conv3d'), - norm_cfg=dict(type='BN3d'), - act_cfg=None) - self.activation = nn.ReLU(inplace=True) - - def forward(self, x): - """Forward function. - - Args: - x (torch.Tensor): of shape (N, C, N_x, N_y, N_z). - - Returns: - torch.Tensor: 5d feature map. - """ - identity = x - x = self.conv0(x) - x = self.conv1(x) - x = identity + x - x = self.activation(x) - return x +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule +from torch import nn + +from ..builder import NECKS + + +@NECKS.register_module() +class OutdoorImVoxelNeck(nn.Module): + """Neck for ImVoxelNet outdoor scenario. + + Args: + in_channels (int): Input channels of multi-scale feature map. + out_channels (int): Output channels of multi-scale feature map. + """ + + def __init__(self, in_channels, out_channels): + super().__init__() + self.model = nn.Sequential( + ResModule(in_channels), + ConvModule( + in_channels=in_channels, + out_channels=in_channels * 2, + kernel_size=3, + stride=(1, 1, 2), + padding=1, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=dict(type='ReLU', inplace=True)), + ResModule(in_channels * 2), + ConvModule( + in_channels=in_channels * 2, + out_channels=in_channels * 4, + kernel_size=3, + stride=(1, 1, 2), + padding=1, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=dict(type='ReLU', inplace=True)), + ResModule(in_channels * 4), + ConvModule( + in_channels=in_channels * 4, + out_channels=out_channels, + kernel_size=3, + padding=(1, 1, 0), + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=dict(type='ReLU', inplace=True))) + + def forward(self, x): + """Forward function. + + Args: + x (torch.Tensor): of shape (N, C_in, N_x, N_y, N_z). + + Returns: + list[torch.Tensor]: of shape (N, C_out, N_y, N_x). + """ + x = self.model.forward(x) + assert x.shape[-1] == 1 + # Anchor3DHead axis order is (y, x). + return [x[..., 0].transpose(-1, -2)] + + def init_weights(self): + """Initialize weights of neck.""" + pass + + +class ResModule(nn.Module): + """3d residual block for ImVoxelNeck. + + Args: + n_channels (int): Input channels of a feature map. + """ + + def __init__(self, n_channels): + super().__init__() + self.conv0 = ConvModule( + in_channels=n_channels, + out_channels=n_channels, + kernel_size=3, + padding=1, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=dict(type='ReLU', inplace=True)) + self.conv1 = ConvModule( + in_channels=n_channels, + out_channels=n_channels, + kernel_size=3, + padding=1, + conv_cfg=dict(type='Conv3d'), + norm_cfg=dict(type='BN3d'), + act_cfg=None) + self.activation = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward function. + + Args: + x (torch.Tensor): of shape (N, C, N_x, N_y, N_z). 
+ + Returns: + torch.Tensor: 5d feature map. + """ + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = identity + x + x = self.activation(x) + return x diff --git a/mmdet3d/models/necks/pointnet2_fp_neck.py b/mmdet3d/models/necks/pointnet2_fp_neck.py index 62db0c1..d63ed5f 100644 --- a/mmdet3d/models/necks/pointnet2_fp_neck.py +++ b/mmdet3d/models/necks/pointnet2_fp_neck.py @@ -1,89 +1,89 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.runner import BaseModule -from torch import nn as nn - -from mmdet3d.ops import PointFPModule -from ..builder import NECKS - - -@NECKS.register_module() -class PointNetFPNeck(BaseModule): - r"""PointNet FP Module used in PointRCNN. - - Refer to the `official code `_. - - .. code-block:: none - - sa_n ---------------------------------------- - | - ... --------------------------------- | - | | - sa_1 ------------- | | - | | | - sa_0 -> fp_0 -> fp_module ->fp_1 -> ... -> fp_module -> fp_n - - sa_n including sa_xyz (torch.Tensor) and sa_features (torch.Tensor) - fp_n including fp_xyz (torch.Tensor) and fp_features (torch.Tensor) - - Args: - fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. - init_cfg (dict or list[dict], optional): Initialization config dict. - Default: None - """ - - def __init__(self, fp_channels, init_cfg=None): - super(PointNetFPNeck, self).__init__(init_cfg=init_cfg) - - self.num_fp = len(fp_channels) - self.FP_modules = nn.ModuleList() - for cur_fp_mlps in fp_channels: - self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps)) - - def _extract_input(self, feat_dict): - """Extract inputs from features dictionary. - - Args: - feat_dict (dict): Feature dict from backbone, which may contain - the following keys and values: - - - sa_xyz (list[torch.Tensor]): Points of each sa module - in shape (N, 3). - - sa_features (list[torch.Tensor]): Output features of - each sa module in shape (N, M). - - Returns: - list[torch.Tensor]: Coordinates of multiple levels of points. - list[torch.Tensor]: Features of multiple levels of points. - """ - sa_xyz = feat_dict['sa_xyz'] - sa_features = feat_dict['sa_features'] - assert len(sa_xyz) == len(sa_features) - - return sa_xyz, sa_features - - def forward(self, feat_dict): - """Forward pass. - - Args: - feat_dict (dict): Feature dict from backbone. - - Returns: - dict[str, torch.Tensor]: Outputs of the Neck. - - - fp_xyz (torch.Tensor): The coordinates of fp features. - - fp_features (torch.Tensor): The features from the last - feature propagation layers. - """ - sa_xyz, sa_features = self._extract_input(feat_dict) - - fp_feature = sa_features[-1] - fp_xyz = sa_xyz[-1] - - for i in range(self.num_fp): - # consume the points in a bottom-up manner - fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], - sa_features[-(i + 2)], fp_feature) - fp_xyz = sa_xyz[-(i + 2)] - - ret = dict(fp_xyz=fp_xyz, fp_features=fp_feature) - return ret +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import BaseModule +from torch import nn as nn + +from mmdet3d.ops import PointFPModule +from ..builder import NECKS + + +@NECKS.register_module() +class PointNetFPNeck(BaseModule): + r"""PointNet FP Module used in PointRCNN. + + Refer to the `official code `_. + + .. code-block:: none + + sa_n ---------------------------------------- + | + ... --------------------------------- | + | | + sa_1 ------------- | | + | | | + sa_0 -> fp_0 -> fp_module ->fp_1 -> ... 
-> fp_module -> fp_n + + sa_n including sa_xyz (torch.Tensor) and sa_features (torch.Tensor) + fp_n including fp_xyz (torch.Tensor) and fp_features (torch.Tensor) + + Args: + fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, fp_channels, init_cfg=None): + super(PointNetFPNeck, self).__init__(init_cfg=init_cfg) + + self.num_fp = len(fp_channels) + self.FP_modules = nn.ModuleList() + for cur_fp_mlps in fp_channels: + self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps)) + + def _extract_input(self, feat_dict): + """Extract inputs from features dictionary. + + Args: + feat_dict (dict): Feature dict from backbone, which may contain + the following keys and values: + + - sa_xyz (list[torch.Tensor]): Points of each sa module + in shape (N, 3). + - sa_features (list[torch.Tensor]): Output features of + each sa module in shape (N, M). + + Returns: + list[torch.Tensor]: Coordinates of multiple levels of points. + list[torch.Tensor]: Features of multiple levels of points. + """ + sa_xyz = feat_dict['sa_xyz'] + sa_features = feat_dict['sa_features'] + assert len(sa_xyz) == len(sa_features) + + return sa_xyz, sa_features + + def forward(self, feat_dict): + """Forward pass. + + Args: + feat_dict (dict): Feature dict from backbone. + + Returns: + dict[str, torch.Tensor]: Outputs of the Neck. + + - fp_xyz (torch.Tensor): The coordinates of fp features. + - fp_features (torch.Tensor): The features from the last + feature propagation layers. + """ + sa_xyz, sa_features = self._extract_input(feat_dict) + + fp_feature = sa_features[-1] + fp_xyz = sa_xyz[-1] + + for i in range(self.num_fp): + # consume the points in a bottom-up manner + fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], + sa_features[-(i + 2)], fp_feature) + fp_xyz = sa_xyz[-(i + 2)] + + ret = dict(fp_xyz=fp_xyz, fp_features=fp_feature) + return ret diff --git a/mmdet3d/models/necks/second_fpn.py b/mmdet3d/models/necks/second_fpn.py index ef1b3de..55c6335 100644 --- a/mmdet3d/models/necks/second_fpn.py +++ b/mmdet3d/models/necks/second_fpn.py @@ -1,91 +1,91 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer -from mmcv.runner import BaseModule, auto_fp16 -from torch import nn as nn - -from ..builder import NECKS - - -@NECKS.register_module() -class SECONDFPN(BaseModule): - """FPN used in SECOND/PointPillars/PartA2/MVXNet. - - Args: - in_channels (list[int]): Input channels of multi-scale feature maps. - out_channels (list[int]): Output channels of feature maps. - upsample_strides (list[int]): Strides used to upsample the - feature maps. - norm_cfg (dict): Config dict of normalization layers. - upsample_cfg (dict): Config dict of upsample layers. - conv_cfg (dict): Config dict of conv layers. - use_conv_for_no_stride (bool): Whether to use conv when stride is 1. 
- """ - - def __init__(self, - in_channels=[128, 128, 256], - out_channels=[256, 256, 256], - upsample_strides=[1, 2, 4], - norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), - upsample_cfg=dict(type='deconv', bias=False), - conv_cfg=dict(type='Conv2d', bias=False), - use_conv_for_no_stride=False, - init_cfg=None): - # if for GroupNorm, - # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) - super(SECONDFPN, self).__init__(init_cfg=init_cfg) - assert len(out_channels) == len(upsample_strides) == len(in_channels) - self.in_channels = in_channels - self.out_channels = out_channels - self.fp16_enabled = False - - deblocks = [] - for i, out_channel in enumerate(out_channels): - stride = upsample_strides[i] - if stride > 1 or (stride == 1 and not use_conv_for_no_stride): - upsample_layer = build_upsample_layer( - upsample_cfg, - in_channels=in_channels[i], - out_channels=out_channel, - kernel_size=upsample_strides[i], - stride=upsample_strides[i]) - else: - stride = np.round(1 / stride).astype(np.int64) - upsample_layer = build_conv_layer( - conv_cfg, - in_channels=in_channels[i], - out_channels=out_channel, - kernel_size=stride, - stride=stride) - - deblock = nn.Sequential(upsample_layer, - build_norm_layer(norm_cfg, out_channel)[1], - nn.ReLU(inplace=True)) - deblocks.append(deblock) - self.deblocks = nn.ModuleList(deblocks) - - if init_cfg is None: - self.init_cfg = [ - dict(type='Kaiming', layer='ConvTranspose2d'), - dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0) - ] - - @auto_fp16() - def forward(self, x): - """Forward function. - - Args: - x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. - - Returns: - list[torch.Tensor]: Multi-level feature maps. - """ - assert len(x) == len(self.in_channels) - ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)] - - if len(ups) > 1: - out = torch.cat(ups, dim=1) - else: - out = ups[0] - return [out] +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer +from mmcv.runner import BaseModule, auto_fp16 +from torch import nn as nn + +from ..builder import NECKS + + +@NECKS.register_module() +class SECONDFPN(BaseModule): + """FPN used in SECOND/PointPillars/PartA2/MVXNet. + + Args: + in_channels (list[int]): Input channels of multi-scale feature maps. + out_channels (list[int]): Output channels of feature maps. + upsample_strides (list[int]): Strides used to upsample the + feature maps. + norm_cfg (dict): Config dict of normalization layers. + upsample_cfg (dict): Config dict of upsample layers. + conv_cfg (dict): Config dict of conv layers. + use_conv_for_no_stride (bool): Whether to use conv when stride is 1. 
+ """ + + def __init__(self, + in_channels=[128, 128, 256], + out_channels=[256, 256, 256], + upsample_strides=[1, 2, 4], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + conv_cfg=dict(type='Conv2d', bias=False), + use_conv_for_no_stride=False, + init_cfg=None): + # if for GroupNorm, + # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) + super(SECONDFPN, self).__init__(init_cfg=init_cfg) + assert len(out_channels) == len(upsample_strides) == len(in_channels) + self.in_channels = in_channels + self.out_channels = out_channels + self.fp16_enabled = False + + deblocks = [] + for i, out_channel in enumerate(out_channels): + stride = upsample_strides[i] + if stride > 1 or (stride == 1 and not use_conv_for_no_stride): + upsample_layer = build_upsample_layer( + upsample_cfg, + in_channels=in_channels[i], + out_channels=out_channel, + kernel_size=upsample_strides[i], + stride=upsample_strides[i]) + else: + stride = np.round(1 / stride).astype(np.int64) + upsample_layer = build_conv_layer( + conv_cfg, + in_channels=in_channels[i], + out_channels=out_channel, + kernel_size=stride, + stride=stride) + + deblock = nn.Sequential(upsample_layer, + build_norm_layer(norm_cfg, out_channel)[1], + nn.ReLU(inplace=True)) + deblocks.append(deblock) + self.deblocks = nn.ModuleList(deblocks) + + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='ConvTranspose2d'), + dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0) + ] + + @auto_fp16() + def forward(self, x): + """Forward function. + + Args: + x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. + + Returns: + list[torch.Tensor]: Multi-level feature maps. + """ + assert len(x) == len(self.in_channels) + ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)] + + if len(ups) > 1: + out = torch.cat(ups, dim=1) + else: + out = ups[0] + return [out] diff --git a/mmdet3d/models/roi_heads/__init__.py b/mmdet3d/models/roi_heads/__init__.py index e607570..916d4a1 100644 --- a/mmdet3d/models/roi_heads/__init__.py +++ b/mmdet3d/models/roi_heads/__init__.py @@ -1,14 +1,14 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base_3droi_head import Base3DRoIHead -from .bbox_heads import PartA2BboxHead -from .h3d_roi_head import H3DRoIHead -from .mask_heads import PointwiseSemanticHead, PrimitiveHead -from .part_aggregation_roi_head import PartAggregationROIHead -from .point_rcnn_roi_head import PointRCNNRoIHead -from .roi_extractors import Single3DRoIAwareExtractor, SingleRoIExtractor - -__all__ = [ - 'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead', - 'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor', - 'H3DRoIHead', 'PrimitiveHead', 'PointRCNNRoIHead' -] +# Copyright (c) OpenMMLab. All rights reserved. 
+from .base_3droi_head import Base3DRoIHead +from .bbox_heads import PartA2BboxHead +from .h3d_roi_head import H3DRoIHead +from .mask_heads import PointwiseSemanticHead, PrimitiveHead +from .part_aggregation_roi_head import PartAggregationROIHead +from .point_rcnn_roi_head import PointRCNNRoIHead +from .roi_extractors import Single3DRoIAwareExtractor, SingleRoIExtractor + +__all__ = [ + 'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead', + 'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor', + 'H3DRoIHead', 'PrimitiveHead', 'PointRCNNRoIHead' +] diff --git a/mmdet3d/models/roi_heads/base_3droi_head.py b/mmdet3d/models/roi_heads/base_3droi_head.py index e1816ff..e5941a2 100644 --- a/mmdet3d/models/roi_heads/base_3droi_head.py +++ b/mmdet3d/models/roi_heads/base_3droi_head.py @@ -1,98 +1,98 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from abc import ABCMeta, abstractmethod - -from mmcv.runner import BaseModule - - -class Base3DRoIHead(BaseModule, metaclass=ABCMeta): - """Base class for 3d RoIHeads.""" - - def __init__(self, - bbox_head=None, - mask_roi_extractor=None, - mask_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(Base3DRoIHead, self).__init__(init_cfg=init_cfg) - self.train_cfg = train_cfg - self.test_cfg = test_cfg - - if bbox_head is not None: - self.init_bbox_head(bbox_head) - - if mask_head is not None: - self.init_mask_head(mask_roi_extractor, mask_head) - - self.init_assigner_sampler() - - @property - def with_bbox(self): - """bool: whether the RoIHead has box head""" - return hasattr(self, 'bbox_head') and self.bbox_head is not None - - @property - def with_mask(self): - """bool: whether the RoIHead has mask head""" - return hasattr(self, 'mask_head') and self.mask_head is not None - - @abstractmethod - def init_bbox_head(self): - """Initialize the box head.""" - pass - - @abstractmethod - def init_mask_head(self): - """Initialize maek head.""" - pass - - @abstractmethod - def init_assigner_sampler(self): - """Initialize assigner and sampler.""" - pass - - @abstractmethod - def forward_train(self, - x, - img_metas, - proposal_list, - gt_bboxes, - gt_labels, - gt_bboxes_ignore=None, - **kwargs): - """Forward function during training. - - Args: - x (dict): Contains features from the first stage. - img_metas (list[dict]): Meta info of each image. - proposal_list (list[dict]): Proposal information from rpn. - gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): - GT bboxes of each sample. The bboxes are encapsulated - by 3D box structures. - gt_labels (list[torch.LongTensor]): GT labels of each sample. - gt_bboxes_ignore (list[torch.Tensor], optional): - Ground truth boxes to be ignored. - - Returns: - dict[str, torch.Tensor]: Losses from each head. - """ - pass - - def simple_test(self, - x, - proposal_list, - img_metas, - proposals=None, - rescale=False, - **kwargs): - """Test without augmentation.""" - pass - - def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs): - """Test with augmentations. - - If rescale is False, then returned bboxes and masks will fit the scale - of imgs[0]. - """ - pass +# Copyright (c) OpenMMLab. All rights reserved. 
+from abc import ABCMeta, abstractmethod + +from mmcv.runner import BaseModule + + +class Base3DRoIHead(BaseModule, metaclass=ABCMeta): + """Base class for 3d RoIHeads.""" + + def __init__(self, + bbox_head=None, + mask_roi_extractor=None, + mask_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(Base3DRoIHead, self).__init__(init_cfg=init_cfg) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if bbox_head is not None: + self.init_bbox_head(bbox_head) + + if mask_head is not None: + self.init_mask_head(mask_roi_extractor, mask_head) + + self.init_assigner_sampler() + + @property + def with_bbox(self): + """bool: whether the RoIHead has box head""" + return hasattr(self, 'bbox_head') and self.bbox_head is not None + + @property + def with_mask(self): + """bool: whether the RoIHead has mask head""" + return hasattr(self, 'mask_head') and self.mask_head is not None + + @abstractmethod + def init_bbox_head(self): + """Initialize the box head.""" + pass + + @abstractmethod + def init_mask_head(self): + """Initialize maek head.""" + pass + + @abstractmethod + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + pass + + @abstractmethod + def forward_train(self, + x, + img_metas, + proposal_list, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + **kwargs): + """Forward function during training. + + Args: + x (dict): Contains features from the first stage. + img_metas (list[dict]): Meta info of each image. + proposal_list (list[dict]): Proposal information from rpn. + gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): + GT bboxes of each sample. The bboxes are encapsulated + by 3D box structures. + gt_labels (list[torch.LongTensor]): GT labels of each sample. + gt_bboxes_ignore (list[torch.Tensor], optional): + Ground truth boxes to be ignored. + + Returns: + dict[str, torch.Tensor]: Losses from each head. + """ + pass + + def simple_test(self, + x, + proposal_list, + img_metas, + proposals=None, + rescale=False, + **kwargs): + """Test without augmentation.""" + pass + + def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs): + """Test with augmentations. + + If rescale is False, then returned bboxes and masks will fit the scale + of imgs[0]. + """ + pass diff --git a/mmdet3d/models/roi_heads/bbox_heads/__init__.py b/mmdet3d/models/roi_heads/bbox_heads/__init__.py index fd7a6b0..daf4f90 100644 --- a/mmdet3d/models/roi_heads/bbox_heads/__init__.py +++ b/mmdet3d/models/roi_heads/bbox_heads/__init__.py @@ -1,14 +1,14 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.models.roi_heads.bbox_heads import (BBoxHead, ConvFCBBoxHead, - DoubleConvFCBBoxHead, - Shared2FCBBoxHead, - Shared4Conv1FCBBoxHead) -from .h3d_bbox_head import H3DBboxHead -from .parta2_bbox_head import PartA2BboxHead -from .point_rcnn_bbox_head import PointRCNNBboxHead - -__all__ = [ - 'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead', - 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'PartA2BboxHead', - 'H3DBboxHead', 'PointRCNNBboxHead' -] +# Copyright (c) OpenMMLab. All rights reserved. 
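Base3DRoIHead only fixes the skeleton: subclasses fill in the bbox/mask initialization hooks, the assigner/sampler setup and forward_train. A toy subclass, purely for illustration and assuming mmdet3d is importable (real heads such as PartAggregationROIHead build actual modules in these hooks):

from mmdet3d.models.roi_heads import Base3DRoIHead


class ToyRoIHead(Base3DRoIHead):

    def init_bbox_head(self, bbox_head):
        self.bbox_head = bbox_head  # normally built from the config dict

    def init_mask_head(self, mask_roi_extractor, mask_head):
        self.mask_head = mask_head

    def init_assigner_sampler(self):
        self.bbox_assigner = None  # placeholder

    def forward_train(self, x, img_metas, proposal_list, gt_bboxes,
                      gt_labels, gt_bboxes_ignore=None, **kwargs):
        return dict()  # would return per-head losses


head = ToyRoIHead(bbox_head=dict(type='PartA2BboxHead'))
print(head.with_bbox, head.with_mask)  # True False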
+from mmdet.models.roi_heads.bbox_heads import (BBoxHead, ConvFCBBoxHead, + DoubleConvFCBBoxHead, + Shared2FCBBoxHead, + Shared4Conv1FCBBoxHead) +from .h3d_bbox_head import H3DBboxHead +from .parta2_bbox_head import PartA2BboxHead +from .point_rcnn_bbox_head import PointRCNNBboxHead + +__all__ = [ + 'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead', + 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'PartA2BboxHead', + 'H3DBboxHead', 'PointRCNNBboxHead' +] diff --git a/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py b/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py index a8bd11a..f960d2b 100644 --- a/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py +++ b/mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py @@ -1,925 +1,925 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule -from torch import nn as nn -from torch.nn import functional as F - -from mmdet3d.core.bbox import DepthInstance3DBoxes -from mmdet3d.core.post_processing import aligned_3d_nms -from mmdet3d.models.builder import HEADS, build_loss -from mmdet3d.models.losses import chamfer_distance -from mmdet3d.ops import build_sa_module -from mmdet.core import build_bbox_coder, multi_apply - - -@HEADS.register_module() -class H3DBboxHead(BaseModule): - r"""Bbox head of `H3DNet `_. - - Args: - num_classes (int): The number of classes. - surface_matching_cfg (dict): Config for surface primitive matching. - line_matching_cfg (dict): Config for line primitive matching. - bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and - decoding boxes. - train_cfg (dict): Config for training. - test_cfg (dict): Config for testing. - gt_per_seed (int): Number of ground truth votes generated - from each seed point. - num_proposal (int): Number of proposal votes generated. - feat_channels (tuple[int]): Convolution channels of - prediction layer. - primitive_feat_refine_streams (int): The number of mlps to - refine primitive feature. - primitive_refine_channels (tuple[int]): Convolution channels of - prediction layer. - upper_thresh (float): Threshold for line matching. - surface_thresh (float): Threshold for surface matching. - line_thresh (float): Threshold for line matching. - conv_cfg (dict): Config of convolution in prediction layer. - norm_cfg (dict): Config of BN in prediction layer. - objectness_loss (dict): Config of objectness loss. - center_loss (dict): Config of center loss. - dir_class_loss (dict): Config of direction classification loss. - dir_res_loss (dict): Config of direction residual regression loss. - size_class_loss (dict): Config of size classification loss. - size_res_loss (dict): Config of size residual regression loss. - semantic_loss (dict): Config of point-wise semantic segmentation loss. - cues_objectness_loss (dict): Config of cues objectness loss. - cues_semantic_loss (dict): Config of cues semantic loss. - proposal_objectness_loss (dict): Config of proposal objectness - loss. - primitive_center_loss (dict): Config of primitive center regression - loss. 
- """ - - def __init__(self, - num_classes, - suface_matching_cfg, - line_matching_cfg, - bbox_coder, - train_cfg=None, - test_cfg=None, - gt_per_seed=1, - num_proposal=256, - feat_channels=(128, 128), - primitive_feat_refine_streams=2, - primitive_refine_channels=[128, 128, 128], - upper_thresh=100.0, - surface_thresh=0.5, - line_thresh=0.5, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - objectness_loss=None, - center_loss=None, - dir_class_loss=None, - dir_res_loss=None, - size_class_loss=None, - size_res_loss=None, - semantic_loss=None, - cues_objectness_loss=None, - cues_semantic_loss=None, - proposal_objectness_loss=None, - primitive_center_loss=None, - init_cfg=None): - super(H3DBboxHead, self).__init__(init_cfg=init_cfg) - self.num_classes = num_classes - self.train_cfg = train_cfg - self.test_cfg = test_cfg - self.gt_per_seed = gt_per_seed - self.num_proposal = num_proposal - self.with_angle = bbox_coder['with_rot'] - self.upper_thresh = upper_thresh - self.surface_thresh = surface_thresh - self.line_thresh = line_thresh - - self.objectness_loss = build_loss(objectness_loss) - self.center_loss = build_loss(center_loss) - self.dir_class_loss = build_loss(dir_class_loss) - self.dir_res_loss = build_loss(dir_res_loss) - self.size_class_loss = build_loss(size_class_loss) - self.size_res_loss = build_loss(size_res_loss) - self.semantic_loss = build_loss(semantic_loss) - - self.bbox_coder = build_bbox_coder(bbox_coder) - self.num_sizes = self.bbox_coder.num_sizes - self.num_dir_bins = self.bbox_coder.num_dir_bins - - self.cues_objectness_loss = build_loss(cues_objectness_loss) - self.cues_semantic_loss = build_loss(cues_semantic_loss) - self.proposal_objectness_loss = build_loss(proposal_objectness_loss) - self.primitive_center_loss = build_loss(primitive_center_loss) - - assert suface_matching_cfg['mlp_channels'][-1] == \ - line_matching_cfg['mlp_channels'][-1] - - # surface center matching - self.surface_center_matcher = build_sa_module(suface_matching_cfg) - # line center matching - self.line_center_matcher = build_sa_module(line_matching_cfg) - - # Compute the matching scores - matching_feat_dims = suface_matching_cfg['mlp_channels'][-1] - self.matching_conv = ConvModule( - matching_feat_dims, - matching_feat_dims, - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True, - inplace=True) - self.matching_pred = nn.Conv1d(matching_feat_dims, 2, 1) - - # Compute the semantic matching scores - self.semantic_matching_conv = ConvModule( - matching_feat_dims, - matching_feat_dims, - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True, - inplace=True) - self.semantic_matching_pred = nn.Conv1d(matching_feat_dims, 2, 1) - - # Surface feature aggregation - self.surface_feats_aggregation = list() - for k in range(primitive_feat_refine_streams): - self.surface_feats_aggregation.append( - ConvModule( - matching_feat_dims, - matching_feat_dims, - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True, - inplace=True)) - self.surface_feats_aggregation = nn.Sequential( - *self.surface_feats_aggregation) - - # Line feature aggregation - self.line_feats_aggregation = list() - for k in range(primitive_feat_refine_streams): - self.line_feats_aggregation.append( - ConvModule( - matching_feat_dims, - matching_feat_dims, - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True, - inplace=True)) - self.line_feats_aggregation = nn.Sequential( - *self.line_feats_aggregation) - - # surface center(6) + line center(12) - prev_channel 
= 18 * matching_feat_dims - self.bbox_pred = nn.ModuleList() - for k in range(len(primitive_refine_channels)): - self.bbox_pred.append( - ConvModule( - prev_channel, - primitive_refine_channels[k], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True, - inplace=False)) - prev_channel = primitive_refine_channels[k] - - # Final object detection - # Objectness scores (2), center residual (3), - # heading class+residual (num_heading_bin*2), size class + - # residual(num_size_cluster*4) - conv_out_channel = (2 + 3 + bbox_coder['num_dir_bins'] * 2 + - bbox_coder['num_sizes'] * 4 + self.num_classes) - self.bbox_pred.append(nn.Conv1d(prev_channel, conv_out_channel, 1)) - - def forward(self, feats_dict, sample_mod): - """Forward pass. - - Args: - feats_dict (dict): Feature dict from backbone. - sample_mod (str): Sample mode for vote aggregation layer. - valid modes are "vote", "seed" and "random". - - Returns: - dict: Predictions of vote head. - """ - ret_dict = {} - aggregated_points = feats_dict['aggregated_points'] - original_feature = feats_dict['aggregated_features'] - batch_size = original_feature.shape[0] - object_proposal = original_feature.shape[2] - - # Extract surface center, features and semantic predictions - z_center = feats_dict['pred_z_center'] - xy_center = feats_dict['pred_xy_center'] - z_semantic = feats_dict['sem_cls_scores_z'] - xy_semantic = feats_dict['sem_cls_scores_xy'] - z_feature = feats_dict['aggregated_features_z'] - xy_feature = feats_dict['aggregated_features_xy'] - # Extract line points and features - line_center = feats_dict['pred_line_center'] - line_feature = feats_dict['aggregated_features_line'] - - surface_center_pred = torch.cat((z_center, xy_center), dim=1) - ret_dict['surface_center_pred'] = surface_center_pred - ret_dict['surface_sem_pred'] = torch.cat((z_semantic, xy_semantic), - dim=1) - - # Extract the surface and line centers of rpn proposals - rpn_proposals = feats_dict['proposal_list'] - rpn_proposals_bbox = DepthInstance3DBoxes( - rpn_proposals.reshape(-1, 7).clone(), - box_dim=rpn_proposals.shape[-1], - with_yaw=self.with_angle, - origin=(0.5, 0.5, 0.5)) - - obj_surface_center, obj_line_center = \ - rpn_proposals_bbox.get_surface_line_center() - obj_surface_center = obj_surface_center.reshape( - batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3) - obj_line_center = obj_line_center.reshape(batch_size, -1, 12, - 3).transpose(1, 2).reshape( - batch_size, -1, 3) - ret_dict['surface_center_object'] = obj_surface_center - ret_dict['line_center_object'] = obj_line_center - - # aggregate primitive z and xy features to rpn proposals - surface_center_feature_pred = torch.cat((z_feature, xy_feature), dim=2) - surface_center_feature_pred = torch.cat( - (surface_center_feature_pred.new_zeros( - (batch_size, 6, surface_center_feature_pred.shape[2])), - surface_center_feature_pred), - dim=1) - - surface_xyz, surface_features, _ = self.surface_center_matcher( - surface_center_pred, - surface_center_feature_pred, - target_xyz=obj_surface_center) - - # aggregate primitive line features to rpn proposals - line_feature = torch.cat((line_feature.new_zeros( - (batch_size, 12, line_feature.shape[2])), line_feature), - dim=1) - line_xyz, line_features, _ = self.line_center_matcher( - line_center, line_feature, target_xyz=obj_line_center) - - # combine the surface and line features - combine_features = torch.cat((surface_features, line_features), dim=2) - - matching_features = self.matching_conv(combine_features) - matching_score = 
self.matching_pred(matching_features) - ret_dict['matching_score'] = matching_score.transpose(2, 1) - - semantic_matching_features = self.semantic_matching_conv( - combine_features) - semantic_matching_score = self.semantic_matching_pred( - semantic_matching_features) - ret_dict['semantic_matching_score'] = \ - semantic_matching_score.transpose(2, 1) - - surface_features = self.surface_feats_aggregation(surface_features) - line_features = self.line_feats_aggregation(line_features) - - # Combine all surface and line features - surface_features = surface_features.view(batch_size, -1, - object_proposal) - line_features = line_features.view(batch_size, -1, object_proposal) - - combine_feature = torch.cat((surface_features, line_features), dim=1) - - # Final bbox predictions - bbox_predictions = self.bbox_pred[0](combine_feature) - bbox_predictions += original_feature - for conv_module in self.bbox_pred[1:]: - bbox_predictions = conv_module(bbox_predictions) - - refine_decode_res = self.bbox_coder.split_pred( - bbox_predictions[:, :self.num_classes + 2], - bbox_predictions[:, self.num_classes + 2:], aggregated_points) - for key in refine_decode_res.keys(): - ret_dict[key + '_optimized'] = refine_decode_res[key] - return ret_dict - - def loss(self, - bbox_preds, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - img_metas=None, - rpn_targets=None, - gt_bboxes_ignore=None): - """Compute loss. - - Args: - bbox_preds (dict): Predictions from forward of h3d bbox head. - points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each sample. - gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (list[torch.Tensor]): Point-wise - semantic mask. - pts_instance_mask (list[torch.Tensor]): Point-wise - instance mask. - img_metas (list[dict]): Contain pcd and img's meta info. - rpn_targets (Tuple) : Targets generated by rpn head. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - - Returns: - dict: Losses of H3dnet. 
- """ - (vote_targets, vote_target_masks, size_class_targets, size_res_targets, - dir_class_targets, dir_res_targets, center_targets, _, mask_targets, - valid_gt_masks, objectness_targets, objectness_weights, - box_loss_weights, valid_gt_weights) = rpn_targets - - losses = {} - - # calculate refined proposal loss - refined_proposal_loss = self.get_proposal_stage_loss( - bbox_preds, - size_class_targets, - size_res_targets, - dir_class_targets, - dir_res_targets, - center_targets, - mask_targets, - objectness_targets, - objectness_weights, - box_loss_weights, - valid_gt_weights, - suffix='_optimized') - for key in refined_proposal_loss.keys(): - losses[key + '_optimized'] = refined_proposal_loss[key] - - bbox3d_optimized = self.bbox_coder.decode( - bbox_preds, suffix='_optimized') - - targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, - bbox_preds) - - (cues_objectness_label, cues_sem_label, proposal_objectness_label, - cues_mask, cues_match_mask, proposal_objectness_mask, - cues_matching_label, obj_surface_line_center) = targets - - # match scores for each geometric primitive - objectness_scores = bbox_preds['matching_score'] - # match scores for the semantics of primitives - objectness_scores_sem = bbox_preds['semantic_matching_score'] - - primitive_objectness_loss = self.cues_objectness_loss( - objectness_scores.transpose(2, 1), - cues_objectness_label, - weight=cues_mask, - avg_factor=cues_mask.sum() + 1e-6) - - primitive_sem_loss = self.cues_semantic_loss( - objectness_scores_sem.transpose(2, 1), - cues_sem_label, - weight=cues_mask, - avg_factor=cues_mask.sum() + 1e-6) - - objectness_scores = bbox_preds['obj_scores_optimized'] - objectness_loss_refine = self.proposal_objectness_loss( - objectness_scores.transpose(2, 1), proposal_objectness_label) - primitive_matching_loss = (objectness_loss_refine * - cues_match_mask).sum() / ( - cues_match_mask.sum() + 1e-6) * 0.5 - primitive_sem_matching_loss = ( - objectness_loss_refine * proposal_objectness_mask).sum() / ( - proposal_objectness_mask.sum() + 1e-6) * 0.5 - - # Get the object surface center here - batch_size, object_proposal = bbox3d_optimized.shape[:2] - refined_bbox = DepthInstance3DBoxes( - bbox3d_optimized.reshape(-1, 7).clone(), - box_dim=bbox3d_optimized.shape[-1], - with_yaw=self.with_angle, - origin=(0.5, 0.5, 0.5)) - - pred_obj_surface_center, pred_obj_line_center = \ - refined_bbox.get_surface_line_center() - pred_obj_surface_center = pred_obj_surface_center.reshape( - batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3) - pred_obj_line_center = pred_obj_line_center.reshape( - batch_size, -1, 12, 3).transpose(1, 2).reshape(batch_size, -1, 3) - pred_surface_line_center = torch.cat( - (pred_obj_surface_center, pred_obj_line_center), 1) - - square_dist = self.primitive_center_loss(pred_surface_line_center, - obj_surface_line_center) - - match_dist = torch.sqrt(square_dist.sum(dim=-1) + 1e-6) - primitive_centroid_reg_loss = torch.sum( - match_dist * cues_matching_label) / ( - cues_matching_label.sum() + 1e-6) - - refined_loss = dict( - primitive_objectness_loss=primitive_objectness_loss, - primitive_sem_loss=primitive_sem_loss, - primitive_matching_loss=primitive_matching_loss, - primitive_sem_matching_loss=primitive_sem_matching_loss, - primitive_centroid_reg_loss=primitive_centroid_reg_loss) - - losses.update(refined_loss) - - return losses - - def get_bboxes(self, - points, - bbox_preds, - input_metas, - rescale=False, - suffix=''): - """Generate bboxes from 
vote head predictions. - - Args: - points (torch.Tensor): Input points. - bbox_preds (dict): Predictions from vote head. - input_metas (list[dict]): Point cloud and image's meta info. - rescale (bool): Whether to rescale bboxes. - - Returns: - list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. - """ - # decode boxes - obj_scores = F.softmax( - bbox_preds['obj_scores' + suffix], dim=-1)[..., -1] - - sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1) - - prediction_collection = {} - prediction_collection['center'] = bbox_preds['center' + suffix] - prediction_collection['dir_class'] = bbox_preds['dir_class'] - prediction_collection['dir_res'] = bbox_preds['dir_res' + suffix] - prediction_collection['size_class'] = bbox_preds['size_class'] - prediction_collection['size_res'] = bbox_preds['size_res' + suffix] - - bbox3d = self.bbox_coder.decode(prediction_collection) - - batch_size = bbox3d.shape[0] - results = list() - for b in range(batch_size): - bbox_selected, score_selected, labels = self.multiclass_nms_single( - obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], - input_metas[b]) - bbox = input_metas[b]['box_type_3d']( - bbox_selected, - box_dim=bbox_selected.shape[-1], - with_yaw=self.bbox_coder.with_rot) - results.append((bbox, score_selected, labels)) - - return results - - def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, - input_meta): - """Multi-class nms in single batch. - - Args: - obj_scores (torch.Tensor): Objectness score of bounding boxes. - sem_scores (torch.Tensor): semantic class score of bounding boxes. - bbox (torch.Tensor): Predicted bounding boxes. - points (torch.Tensor): Input points. - input_meta (dict): Point cloud and image's meta info. - - Returns: - tuple[torch.Tensor]: Bounding boxes, scores and labels. 
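The method body that follows first collapses every box into its axis-aligned envelope before running aligned_3d_nms. In isolation, that envelope is just a per-box min/max over the eight corners (random corners used here for brevity):

import torch

corners = torch.rand(4, 8, 3)  # (N, 8, 3) corner coordinates of N boxes
minmax_box3d = torch.cat(
    [corners.min(dim=1)[0], corners.max(dim=1)[0]], dim=1)
# (N, 6): (x_min, y_min, z_min, x_max, y_max, z_max)
print(minmax_box3d.shape)  # torch.Size([4, 6])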
- """ - bbox = input_meta['box_type_3d']( - bbox, - box_dim=bbox.shape[-1], - with_yaw=self.bbox_coder.with_rot, - origin=(0.5, 0.5, 0.5)) - box_indices = bbox.points_in_boxes_all(points) - - corner3d = bbox.corners - minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) - minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] - minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] - - nonempty_box_mask = box_indices.T.sum(1) > 5 - - bbox_classes = torch.argmax(sem_scores, -1) - nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], - obj_scores[nonempty_box_mask], - bbox_classes[nonempty_box_mask], - self.test_cfg.nms_thr) - - # filter empty boxes and boxes with low score - scores_mask = (obj_scores > self.test_cfg.score_thr) - nonempty_box_inds = torch.nonzero( - nonempty_box_mask, as_tuple=False).flatten() - nonempty_mask = torch.zeros_like(bbox_classes).scatter( - 0, nonempty_box_inds[nms_selected], 1) - selected = (nonempty_mask.bool() & scores_mask.bool()) - - if self.test_cfg.per_class_proposal: - bbox_selected, score_selected, labels = [], [], [] - for k in range(sem_scores.shape[-1]): - bbox_selected.append(bbox[selected].tensor) - score_selected.append(obj_scores[selected] * - sem_scores[selected][:, k]) - labels.append( - torch.zeros_like(bbox_classes[selected]).fill_(k)) - bbox_selected = torch.cat(bbox_selected, 0) - score_selected = torch.cat(score_selected, 0) - labels = torch.cat(labels, 0) - else: - bbox_selected = bbox[selected].tensor - score_selected = obj_scores[selected] - labels = bbox_classes[selected] - - return bbox_selected, score_selected, labels - - def get_proposal_stage_loss(self, - bbox_preds, - size_class_targets, - size_res_targets, - dir_class_targets, - dir_res_targets, - center_targets, - mask_targets, - objectness_targets, - objectness_weights, - box_loss_weights, - valid_gt_weights, - suffix=''): - """Compute loss for the aggregation module. - - Args: - bbox_preds (dict): Predictions from forward of vote head. - size_class_targets (torch.Tensor): Ground truth - size class of each prediction bounding box. - size_res_targets (torch.Tensor): Ground truth - size residual of each prediction bounding box. - dir_class_targets (torch.Tensor): Ground truth - direction class of each prediction bounding box. - dir_res_targets (torch.Tensor): Ground truth - direction residual of each prediction bounding box. - center_targets (torch.Tensor): Ground truth center - of each prediction bounding box. - mask_targets (torch.Tensor): Validation of each - prediction bounding box. - objectness_targets (torch.Tensor): Ground truth - objectness label of each prediction bounding box. - objectness_weights (torch.Tensor): Weights of objectness - loss for each prediction bounding box. - box_loss_weights (torch.Tensor): Weights of regression - loss for each prediction bounding box. - valid_gt_weights (torch.Tensor): Validation of each - ground truth bounding box. - - Returns: - dict: Losses of aggregation module. 
- """ - # calculate objectness loss - objectness_loss = self.objectness_loss( - bbox_preds['obj_scores' + suffix].transpose(2, 1), - objectness_targets, - weight=objectness_weights) - - # calculate center loss - source2target_loss, target2source_loss = self.center_loss( - bbox_preds['center' + suffix], - center_targets, - src_weight=box_loss_weights, - dst_weight=valid_gt_weights) - center_loss = source2target_loss + target2source_loss - - # calculate direction class loss - dir_class_loss = self.dir_class_loss( - bbox_preds['dir_class' + suffix].transpose(2, 1), - dir_class_targets, - weight=box_loss_weights) - - # calculate direction residual loss - batch_size, proposal_num = size_class_targets.shape[:2] - heading_label_one_hot = dir_class_targets.new_zeros( - (batch_size, proposal_num, self.num_dir_bins)) - heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) - dir_res_norm = (bbox_preds['dir_res_norm' + suffix] * - heading_label_one_hot).sum(dim=-1) - dir_res_loss = self.dir_res_loss( - dir_res_norm, dir_res_targets, weight=box_loss_weights) - - # calculate size class loss - size_class_loss = self.size_class_loss( - bbox_preds['size_class' + suffix].transpose(2, 1), - size_class_targets, - weight=box_loss_weights) - - # calculate size residual loss - one_hot_size_targets = box_loss_weights.new_zeros( - (batch_size, proposal_num, self.num_sizes)) - one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) - one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( - -1).repeat(1, 1, 1, 3) - size_residual_norm = (bbox_preds['size_res_norm' + suffix] * - one_hot_size_targets_expand).sum(dim=2) - box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat( - 1, 1, 3) - size_res_loss = self.size_res_loss( - size_residual_norm, - size_res_targets, - weight=box_loss_weights_expand) - - # calculate semantic loss - semantic_loss = self.semantic_loss( - bbox_preds['sem_scores' + suffix].transpose(2, 1), - mask_targets, - weight=box_loss_weights) - - losses = dict( - objectness_loss=objectness_loss, - semantic_loss=semantic_loss, - center_loss=center_loss, - dir_class_loss=dir_class_loss, - dir_res_loss=dir_res_loss, - size_class_loss=size_class_loss, - size_res_loss=size_res_loss) - - return losses - - def get_targets(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - bbox_preds=None): - """Generate targets of proposal module. - - Args: - points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (list[torch.Tensor]): Point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): Point-wise instance - label of each batch. - bbox_preds (torch.Tensor): Bounding box predictions of vote head. - - Returns: - tuple[torch.Tensor]: Targets of proposal module. 
- """ - # find empty example - valid_gt_masks = list() - gt_num = list() - for index in range(len(gt_labels_3d)): - if len(gt_labels_3d[index]) == 0: - fake_box = gt_bboxes_3d[index].tensor.new_zeros( - 1, gt_bboxes_3d[index].tensor.shape[-1]) - gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) - gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) - valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) - gt_num.append(1) - else: - valid_gt_masks.append(gt_labels_3d[index].new_ones( - gt_labels_3d[index].shape)) - gt_num.append(gt_labels_3d[index].shape[0]) - - if pts_semantic_mask is None: - pts_semantic_mask = [None for i in range(len(gt_labels_3d))] - pts_instance_mask = [None for i in range(len(gt_labels_3d))] - - aggregated_points = [ - bbox_preds['aggregated_points'][i] - for i in range(len(gt_labels_3d)) - ] - - surface_center_pred = [ - bbox_preds['surface_center_pred'][i] - for i in range(len(gt_labels_3d)) - ] - - line_center_pred = [ - bbox_preds['pred_line_center'][i] - for i in range(len(gt_labels_3d)) - ] - - surface_center_object = [ - bbox_preds['surface_center_object'][i] - for i in range(len(gt_labels_3d)) - ] - - line_center_object = [ - bbox_preds['line_center_object'][i] - for i in range(len(gt_labels_3d)) - ] - - surface_sem_pred = [ - bbox_preds['surface_sem_pred'][i] - for i in range(len(gt_labels_3d)) - ] - - line_sem_pred = [ - bbox_preds['sem_cls_scores_line'][i] - for i in range(len(gt_labels_3d)) - ] - - (cues_objectness_label, cues_sem_label, proposal_objectness_label, - cues_mask, cues_match_mask, proposal_objectness_mask, - cues_matching_label, obj_surface_line_center) = multi_apply( - self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, aggregated_points, - surface_center_pred, line_center_pred, surface_center_object, - line_center_object, surface_sem_pred, line_sem_pred) - - cues_objectness_label = torch.stack(cues_objectness_label) - cues_sem_label = torch.stack(cues_sem_label) - proposal_objectness_label = torch.stack(proposal_objectness_label) - cues_mask = torch.stack(cues_mask) - cues_match_mask = torch.stack(cues_match_mask) - proposal_objectness_mask = torch.stack(proposal_objectness_mask) - cues_matching_label = torch.stack(cues_matching_label) - obj_surface_line_center = torch.stack(obj_surface_line_center) - - return (cues_objectness_label, cues_sem_label, - proposal_objectness_label, cues_mask, cues_match_mask, - proposal_objectness_mask, cues_matching_label, - obj_surface_line_center) - - def get_targets_single(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - aggregated_points=None, - pred_surface_center=None, - pred_line_center=None, - pred_obj_surface_center=None, - pred_obj_line_center=None, - pred_surface_sem=None, - pred_line_sem=None): - """Generate targets for primitive cues for single batch. - - Args: - points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth - boxes of each batch. - gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (torch.Tensor): Point-wise semantic - label of each batch. - pts_instance_mask (torch.Tensor): Point-wise instance - label of each batch. - aggregated_points (torch.Tensor): Aggregated points from - vote aggregation layer. - pred_surface_center (torch.Tensor): Prediction of surface center. - pred_line_center (torch.Tensor): Prediction of line center. 
- pred_obj_surface_center (torch.Tensor): Objectness prediction - of surface center. - pred_obj_line_center (torch.Tensor): Objectness prediction of - line center. - pred_surface_sem (torch.Tensor): Semantic prediction of - surface center. - pred_line_sem (torch.Tensor): Semantic prediction of line center. - Returns: - tuple[torch.Tensor]: Targets for primitive cues. - """ - device = points.device - gt_bboxes_3d = gt_bboxes_3d.to(device) - num_proposals = aggregated_points.shape[0] - gt_center = gt_bboxes_3d.gravity_center - - dist1, dist2, ind1, _ = chamfer_distance( - aggregated_points.unsqueeze(0), - gt_center.unsqueeze(0), - reduction='none') - # Set assignment - object_assignment = ind1.squeeze(0) - - # Generate objectness label and mask - # objectness_label: 1 if pred object center is within - # self.train_cfg['near_threshold'] of any GT object - # objectness_mask: 0 if pred object center is in gray - # zone (DONOTCARE), 1 otherwise - euclidean_dist1 = torch.sqrt(dist1.squeeze(0) + 1e-6) - proposal_objectness_label = euclidean_dist1.new_zeros( - num_proposals, dtype=torch.long) - proposal_objectness_mask = euclidean_dist1.new_zeros(num_proposals) - - gt_sem = gt_labels_3d[object_assignment] - - obj_surface_center, obj_line_center = \ - gt_bboxes_3d.get_surface_line_center() - obj_surface_center = obj_surface_center.reshape(-1, 6, - 3).transpose(0, 1) - obj_line_center = obj_line_center.reshape(-1, 12, 3).transpose(0, 1) - obj_surface_center = obj_surface_center[:, object_assignment].reshape( - 1, -1, 3) - obj_line_center = obj_line_center[:, - object_assignment].reshape(1, -1, 3) - - surface_sem = torch.argmax(pred_surface_sem, dim=1).float() - line_sem = torch.argmax(pred_line_sem, dim=1).float() - - dist_surface, _, surface_ind, _ = chamfer_distance( - obj_surface_center, - pred_surface_center.unsqueeze(0), - reduction='none') - dist_line, _, line_ind, _ = chamfer_distance( - obj_line_center, pred_line_center.unsqueeze(0), reduction='none') - - surface_sel = pred_surface_center[surface_ind.squeeze(0)] - line_sel = pred_line_center[line_ind.squeeze(0)] - surface_sel_sem = surface_sem[surface_ind.squeeze(0)] - line_sel_sem = line_sem[line_ind.squeeze(0)] - - surface_sel_sem_gt = gt_sem.repeat(6).float() - line_sel_sem_gt = gt_sem.repeat(12).float() - - euclidean_dist_surface = torch.sqrt(dist_surface.squeeze(0) + 1e-6) - euclidean_dist_line = torch.sqrt(dist_line.squeeze(0) + 1e-6) - objectness_label_surface = euclidean_dist_line.new_zeros( - num_proposals * 6, dtype=torch.long) - objectness_mask_surface = euclidean_dist_line.new_zeros(num_proposals * - 6) - objectness_label_line = euclidean_dist_line.new_zeros( - num_proposals * 12, dtype=torch.long) - objectness_mask_line = euclidean_dist_line.new_zeros(num_proposals * - 12) - objectness_label_surface_sem = euclidean_dist_line.new_zeros( - num_proposals * 6, dtype=torch.long) - objectness_label_line_sem = euclidean_dist_line.new_zeros( - num_proposals * 12, dtype=torch.long) - - euclidean_dist_obj_surface = torch.sqrt(( - (pred_obj_surface_center - surface_sel)**2).sum(dim=-1) + 1e-6) - euclidean_dist_obj_line = torch.sqrt( - torch.sum((pred_obj_line_center - line_sel)**2, dim=-1) + 1e-6) - - # Objectness score just with centers - proposal_objectness_label[ - euclidean_dist1 < self.train_cfg['near_threshold']] = 1 - proposal_objectness_mask[ - euclidean_dist1 < self.train_cfg['near_threshold']] = 1 - proposal_objectness_mask[ - euclidean_dist1 > self.train_cfg['far_threshold']] = 1 - - objectness_label_surface[ - 
(euclidean_dist_obj_surface < - self.train_cfg['label_surface_threshold']) * - (euclidean_dist_surface < - self.train_cfg['mask_surface_threshold'])] = 1 - objectness_label_surface_sem[ - (euclidean_dist_obj_surface < - self.train_cfg['label_surface_threshold']) * - (euclidean_dist_surface < self.train_cfg['mask_surface_threshold']) - * (surface_sel_sem == surface_sel_sem_gt)] = 1 - - objectness_label_line[ - (euclidean_dist_obj_line < self.train_cfg['label_line_threshold']) - * - (euclidean_dist_line < self.train_cfg['mask_line_threshold'])] = 1 - objectness_label_line_sem[ - (euclidean_dist_obj_line < self.train_cfg['label_line_threshold']) - * (euclidean_dist_line < self.train_cfg['mask_line_threshold']) * - (line_sel_sem == line_sel_sem_gt)] = 1 - - objectness_label_surface_obj = proposal_objectness_label.repeat(6) - objectness_mask_surface_obj = proposal_objectness_mask.repeat(6) - objectness_label_line_obj = proposal_objectness_label.repeat(12) - objectness_mask_line_obj = proposal_objectness_mask.repeat(12) - - objectness_mask_surface = objectness_mask_surface_obj - objectness_mask_line = objectness_mask_line_obj - - cues_objectness_label = torch.cat( - (objectness_label_surface, objectness_label_line), 0) - cues_sem_label = torch.cat( - (objectness_label_surface_sem, objectness_label_line_sem), 0) - cues_mask = torch.cat((objectness_mask_surface, objectness_mask_line), - 0) - - objectness_label_surface *= objectness_label_surface_obj - objectness_label_line *= objectness_label_line_obj - cues_matching_label = torch.cat( - (objectness_label_surface, objectness_label_line), 0) - - objectness_label_surface_sem *= objectness_label_surface_obj - objectness_label_line_sem *= objectness_label_line_obj - - cues_match_mask = (torch.sum( - cues_objectness_label.view(18, num_proposals), dim=0) >= - 1).float() - - obj_surface_line_center = torch.cat( - (obj_surface_center, obj_line_center), 1).squeeze(0) - - return (cues_objectness_label, cues_sem_label, - proposal_objectness_label, cues_mask, cues_match_mask, - proposal_objectness_mask, cues_matching_label, - obj_surface_line_center) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule +from torch import nn as nn +from torch.nn import functional as F + +from mmdet3d.core.bbox import DepthInstance3DBoxes +from mmdet3d.core.post_processing import aligned_3d_nms +from mmdet3d.models.builder import HEADS, build_loss +from mmdet3d.models.losses import chamfer_distance +from mmdet3d.ops import build_sa_module +from mmdet.core import build_bbox_coder, multi_apply + + +@HEADS.register_module() +class H3DBboxHead(BaseModule): + r"""Bbox head of `H3DNet `_. + + Args: + num_classes (int): The number of classes. + surface_matching_cfg (dict): Config for surface primitive matching. + line_matching_cfg (dict): Config for line primitive matching. + bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and + decoding boxes. + train_cfg (dict): Config for training. + test_cfg (dict): Config for testing. + gt_per_seed (int): Number of ground truth votes generated + from each seed point. + num_proposal (int): Number of proposal votes generated. + feat_channels (tuple[int]): Convolution channels of + prediction layer. + primitive_feat_refine_streams (int): The number of mlps to + refine primitive feature. + primitive_refine_channels (tuple[int]): Convolution channels of + prediction layer. + upper_thresh (float): Threshold for line matching. 
+ surface_thresh (float): Threshold for surface matching. + line_thresh (float): Threshold for line matching. + conv_cfg (dict): Config of convolution in prediction layer. + norm_cfg (dict): Config of BN in prediction layer. + objectness_loss (dict): Config of objectness loss. + center_loss (dict): Config of center loss. + dir_class_loss (dict): Config of direction classification loss. + dir_res_loss (dict): Config of direction residual regression loss. + size_class_loss (dict): Config of size classification loss. + size_res_loss (dict): Config of size residual regression loss. + semantic_loss (dict): Config of point-wise semantic segmentation loss. + cues_objectness_loss (dict): Config of cues objectness loss. + cues_semantic_loss (dict): Config of cues semantic loss. + proposal_objectness_loss (dict): Config of proposal objectness + loss. + primitive_center_loss (dict): Config of primitive center regression + loss. + """ + + def __init__(self, + num_classes, + suface_matching_cfg, + line_matching_cfg, + bbox_coder, + train_cfg=None, + test_cfg=None, + gt_per_seed=1, + num_proposal=256, + feat_channels=(128, 128), + primitive_feat_refine_streams=2, + primitive_refine_channels=[128, 128, 128], + upper_thresh=100.0, + surface_thresh=0.5, + line_thresh=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=None, + center_loss=None, + dir_class_loss=None, + dir_res_loss=None, + size_class_loss=None, + size_res_loss=None, + semantic_loss=None, + cues_objectness_loss=None, + cues_semantic_loss=None, + proposal_objectness_loss=None, + primitive_center_loss=None, + init_cfg=None): + super(H3DBboxHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.gt_per_seed = gt_per_seed + self.num_proposal = num_proposal + self.with_angle = bbox_coder['with_rot'] + self.upper_thresh = upper_thresh + self.surface_thresh = surface_thresh + self.line_thresh = line_thresh + + self.objectness_loss = build_loss(objectness_loss) + self.center_loss = build_loss(center_loss) + self.dir_class_loss = build_loss(dir_class_loss) + self.dir_res_loss = build_loss(dir_res_loss) + self.size_class_loss = build_loss(size_class_loss) + self.size_res_loss = build_loss(size_res_loss) + self.semantic_loss = build_loss(semantic_loss) + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.num_sizes = self.bbox_coder.num_sizes + self.num_dir_bins = self.bbox_coder.num_dir_bins + + self.cues_objectness_loss = build_loss(cues_objectness_loss) + self.cues_semantic_loss = build_loss(cues_semantic_loss) + self.proposal_objectness_loss = build_loss(proposal_objectness_loss) + self.primitive_center_loss = build_loss(primitive_center_loss) + + assert suface_matching_cfg['mlp_channels'][-1] == \ + line_matching_cfg['mlp_channels'][-1] + + # surface center matching + self.surface_center_matcher = build_sa_module(suface_matching_cfg) + # line center matching + self.line_center_matcher = build_sa_module(line_matching_cfg) + + # Compute the matching scores + matching_feat_dims = suface_matching_cfg['mlp_channels'][-1] + self.matching_conv = ConvModule( + matching_feat_dims, + matching_feat_dims, + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True, + inplace=True) + self.matching_pred = nn.Conv1d(matching_feat_dims, 2, 1) + + # Compute the semantic matching scores + self.semantic_matching_conv = ConvModule( + matching_feat_dims, + matching_feat_dims, + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + 
bias=True, + inplace=True) + self.semantic_matching_pred = nn.Conv1d(matching_feat_dims, 2, 1) + + # Surface feature aggregation + self.surface_feats_aggregation = list() + for k in range(primitive_feat_refine_streams): + self.surface_feats_aggregation.append( + ConvModule( + matching_feat_dims, + matching_feat_dims, + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True, + inplace=True)) + self.surface_feats_aggregation = nn.Sequential( + *self.surface_feats_aggregation) + + # Line feature aggregation + self.line_feats_aggregation = list() + for k in range(primitive_feat_refine_streams): + self.line_feats_aggregation.append( + ConvModule( + matching_feat_dims, + matching_feat_dims, + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True, + inplace=True)) + self.line_feats_aggregation = nn.Sequential( + *self.line_feats_aggregation) + + # surface center(6) + line center(12) + prev_channel = 18 * matching_feat_dims + self.bbox_pred = nn.ModuleList() + for k in range(len(primitive_refine_channels)): + self.bbox_pred.append( + ConvModule( + prev_channel, + primitive_refine_channels[k], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True, + inplace=False)) + prev_channel = primitive_refine_channels[k] + + # Final object detection + # Objectness scores (2), center residual (3), + # heading class+residual (num_heading_bin*2), size class + + # residual(num_size_cluster*4) + conv_out_channel = (2 + 3 + bbox_coder['num_dir_bins'] * 2 + + bbox_coder['num_sizes'] * 4 + self.num_classes) + self.bbox_pred.append(nn.Conv1d(prev_channel, conv_out_channel, 1)) + + def forward(self, feats_dict, sample_mod): + """Forward pass. + + Args: + feats_dict (dict): Feature dict from backbone. + sample_mod (str): Sample mode for vote aggregation layer. + valid modes are "vote", "seed" and "random". + + Returns: + dict: Predictions of vote head. 
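Each proposal contributes 6 surface centers and 12 line centers, which is why the first refinement conv above takes `18 * matching_feat_dims` input channels. The shape-only sketch below (plain tensors with illustrative sizes, not the module's real outputs) mirrors the view/concat that `forward()` performs further down:

```python
# Shape bookkeeping only; sizes are illustrative.
import torch

batch_size, num_proposal, feat_dim = 2, 256, 32  # feat_dim stands in for
                                                 # suface_matching_cfg['mlp_channels'][-1]

# One feature vector per primitive: 6 surface and 12 line centers per proposal.
surface_feats = torch.rand(batch_size, feat_dim, 6 * num_proposal)
line_feats = torch.rand(batch_size, feat_dim, 12 * num_proposal)

# Stack every proposal's 18 primitive features along the channel axis,
# matching what the refinement convs expect.
surface_feats = surface_feats.view(batch_size, -1, num_proposal)  # (B, 6*C, P)
line_feats = line_feats.view(batch_size, -1, num_proposal)        # (B, 12*C, P)
combined = torch.cat((surface_feats, line_feats), dim=1)          # (B, 18*C, P)

assert combined.shape == (batch_size, 18 * feat_dim, num_proposal)
```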
+ """ + ret_dict = {} + aggregated_points = feats_dict['aggregated_points'] + original_feature = feats_dict['aggregated_features'] + batch_size = original_feature.shape[0] + object_proposal = original_feature.shape[2] + + # Extract surface center, features and semantic predictions + z_center = feats_dict['pred_z_center'] + xy_center = feats_dict['pred_xy_center'] + z_semantic = feats_dict['sem_cls_scores_z'] + xy_semantic = feats_dict['sem_cls_scores_xy'] + z_feature = feats_dict['aggregated_features_z'] + xy_feature = feats_dict['aggregated_features_xy'] + # Extract line points and features + line_center = feats_dict['pred_line_center'] + line_feature = feats_dict['aggregated_features_line'] + + surface_center_pred = torch.cat((z_center, xy_center), dim=1) + ret_dict['surface_center_pred'] = surface_center_pred + ret_dict['surface_sem_pred'] = torch.cat((z_semantic, xy_semantic), + dim=1) + + # Extract the surface and line centers of rpn proposals + rpn_proposals = feats_dict['proposal_list'] + rpn_proposals_bbox = DepthInstance3DBoxes( + rpn_proposals.reshape(-1, 7).clone(), + box_dim=rpn_proposals.shape[-1], + with_yaw=self.with_angle, + origin=(0.5, 0.5, 0.5)) + + obj_surface_center, obj_line_center = \ + rpn_proposals_bbox.get_surface_line_center() + obj_surface_center = obj_surface_center.reshape( + batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3) + obj_line_center = obj_line_center.reshape(batch_size, -1, 12, + 3).transpose(1, 2).reshape( + batch_size, -1, 3) + ret_dict['surface_center_object'] = obj_surface_center + ret_dict['line_center_object'] = obj_line_center + + # aggregate primitive z and xy features to rpn proposals + surface_center_feature_pred = torch.cat((z_feature, xy_feature), dim=2) + surface_center_feature_pred = torch.cat( + (surface_center_feature_pred.new_zeros( + (batch_size, 6, surface_center_feature_pred.shape[2])), + surface_center_feature_pred), + dim=1) + + surface_xyz, surface_features, _ = self.surface_center_matcher( + surface_center_pred, + surface_center_feature_pred, + target_xyz=obj_surface_center) + + # aggregate primitive line features to rpn proposals + line_feature = torch.cat((line_feature.new_zeros( + (batch_size, 12, line_feature.shape[2])), line_feature), + dim=1) + line_xyz, line_features, _ = self.line_center_matcher( + line_center, line_feature, target_xyz=obj_line_center) + + # combine the surface and line features + combine_features = torch.cat((surface_features, line_features), dim=2) + + matching_features = self.matching_conv(combine_features) + matching_score = self.matching_pred(matching_features) + ret_dict['matching_score'] = matching_score.transpose(2, 1) + + semantic_matching_features = self.semantic_matching_conv( + combine_features) + semantic_matching_score = self.semantic_matching_pred( + semantic_matching_features) + ret_dict['semantic_matching_score'] = \ + semantic_matching_score.transpose(2, 1) + + surface_features = self.surface_feats_aggregation(surface_features) + line_features = self.line_feats_aggregation(line_features) + + # Combine all surface and line features + surface_features = surface_features.view(batch_size, -1, + object_proposal) + line_features = line_features.view(batch_size, -1, object_proposal) + + combine_feature = torch.cat((surface_features, line_features), dim=1) + + # Final bbox predictions + bbox_predictions = self.bbox_pred[0](combine_feature) + bbox_predictions += original_feature + for conv_module in self.bbox_pred[1:]: + bbox_predictions = conv_module(bbox_predictions) + + 
refine_decode_res = self.bbox_coder.split_pred( + bbox_predictions[:, :self.num_classes + 2], + bbox_predictions[:, self.num_classes + 2:], aggregated_points) + for key in refine_decode_res.keys(): + ret_dict[key + '_optimized'] = refine_decode_res[key] + return ret_dict + + def loss(self, + bbox_preds, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + img_metas=None, + rpn_targets=None, + gt_bboxes_ignore=None): + """Compute loss. + + Args: + bbox_preds (dict): Predictions from forward of h3d bbox head. + points (list[torch.Tensor]): Input points. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + pts_semantic_mask (list[torch.Tensor]): Point-wise + semantic mask. + pts_instance_mask (list[torch.Tensor]): Point-wise + instance mask. + img_metas (list[dict]): Contain pcd and img's meta info. + rpn_targets (Tuple) : Targets generated by rpn head. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + + Returns: + dict: Losses of H3dnet. + """ + (vote_targets, vote_target_masks, size_class_targets, size_res_targets, + dir_class_targets, dir_res_targets, center_targets, _, mask_targets, + valid_gt_masks, objectness_targets, objectness_weights, + box_loss_weights, valid_gt_weights) = rpn_targets + + losses = {} + + # calculate refined proposal loss + refined_proposal_loss = self.get_proposal_stage_loss( + bbox_preds, + size_class_targets, + size_res_targets, + dir_class_targets, + dir_res_targets, + center_targets, + mask_targets, + objectness_targets, + objectness_weights, + box_loss_weights, + valid_gt_weights, + suffix='_optimized') + for key in refined_proposal_loss.keys(): + losses[key + '_optimized'] = refined_proposal_loss[key] + + bbox3d_optimized = self.bbox_coder.decode( + bbox_preds, suffix='_optimized') + + targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, + bbox_preds) + + (cues_objectness_label, cues_sem_label, proposal_objectness_label, + cues_mask, cues_match_mask, proposal_objectness_mask, + cues_matching_label, obj_surface_line_center) = targets + + # match scores for each geometric primitive + objectness_scores = bbox_preds['matching_score'] + # match scores for the semantics of primitives + objectness_scores_sem = bbox_preds['semantic_matching_score'] + + primitive_objectness_loss = self.cues_objectness_loss( + objectness_scores.transpose(2, 1), + cues_objectness_label, + weight=cues_mask, + avg_factor=cues_mask.sum() + 1e-6) + + primitive_sem_loss = self.cues_semantic_loss( + objectness_scores_sem.transpose(2, 1), + cues_sem_label, + weight=cues_mask, + avg_factor=cues_mask.sum() + 1e-6) + + objectness_scores = bbox_preds['obj_scores_optimized'] + objectness_loss_refine = self.proposal_objectness_loss( + objectness_scores.transpose(2, 1), proposal_objectness_label) + primitive_matching_loss = (objectness_loss_refine * + cues_match_mask).sum() / ( + cues_match_mask.sum() + 1e-6) * 0.5 + primitive_sem_matching_loss = ( + objectness_loss_refine * proposal_objectness_mask).sum() / ( + proposal_objectness_mask.sum() + 1e-6) * 0.5 + + # Get the object surface center here + batch_size, object_proposal = bbox3d_optimized.shape[:2] + refined_bbox = DepthInstance3DBoxes( + bbox3d_optimized.reshape(-1, 7).clone(), + box_dim=bbox3d_optimized.shape[-1], + with_yaw=self.with_angle, + origin=(0.5, 0.5, 0.5)) + + pred_obj_surface_center, pred_obj_line_center = \ + 
refined_bbox.get_surface_line_center() + pred_obj_surface_center = pred_obj_surface_center.reshape( + batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3) + pred_obj_line_center = pred_obj_line_center.reshape( + batch_size, -1, 12, 3).transpose(1, 2).reshape(batch_size, -1, 3) + pred_surface_line_center = torch.cat( + (pred_obj_surface_center, pred_obj_line_center), 1) + + square_dist = self.primitive_center_loss(pred_surface_line_center, + obj_surface_line_center) + + match_dist = torch.sqrt(square_dist.sum(dim=-1) + 1e-6) + primitive_centroid_reg_loss = torch.sum( + match_dist * cues_matching_label) / ( + cues_matching_label.sum() + 1e-6) + + refined_loss = dict( + primitive_objectness_loss=primitive_objectness_loss, + primitive_sem_loss=primitive_sem_loss, + primitive_matching_loss=primitive_matching_loss, + primitive_sem_matching_loss=primitive_sem_matching_loss, + primitive_centroid_reg_loss=primitive_centroid_reg_loss) + + losses.update(refined_loss) + + return losses + + def get_bboxes(self, + points, + bbox_preds, + input_metas, + rescale=False, + suffix=''): + """Generate bboxes from vote head predictions. + + Args: + points (torch.Tensor): Input points. + bbox_preds (dict): Predictions from vote head. + input_metas (list[dict]): Point cloud and image's meta info. + rescale (bool): Whether to rescale bboxes. + + Returns: + list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. + """ + # decode boxes + obj_scores = F.softmax( + bbox_preds['obj_scores' + suffix], dim=-1)[..., -1] + + sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1) + + prediction_collection = {} + prediction_collection['center'] = bbox_preds['center' + suffix] + prediction_collection['dir_class'] = bbox_preds['dir_class'] + prediction_collection['dir_res'] = bbox_preds['dir_res' + suffix] + prediction_collection['size_class'] = bbox_preds['size_class'] + prediction_collection['size_res'] = bbox_preds['size_res' + suffix] + + bbox3d = self.bbox_coder.decode(prediction_collection) + + batch_size = bbox3d.shape[0] + results = list() + for b in range(batch_size): + bbox_selected, score_selected, labels = self.multiclass_nms_single( + obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], + input_metas[b]) + bbox = input_metas[b]['box_type_3d']( + bbox_selected, + box_dim=bbox_selected.shape[-1], + with_yaw=self.bbox_coder.with_rot) + results.append((bbox, score_selected, labels)) + + return results + + def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, + input_meta): + """Multi-class nms in single batch. + + Args: + obj_scores (torch.Tensor): Objectness score of bounding boxes. + sem_scores (torch.Tensor): semantic class score of bounding boxes. + bbox (torch.Tensor): Predicted bounding boxes. + points (torch.Tensor): Input points. + input_meta (dict): Point cloud and image's meta info. + + Returns: + tuple[torch.Tensor]: Bounding boxes, scores and labels. 
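Several of the refinement losses above reduce a per-element penalty with a binary cue mask and an epsilon-stabilized denominator, e.g. `(match_dist * cues_matching_label).sum() / (cues_matching_label.sum() + 1e-6)`. A minimal numeric sketch of that masked-average pattern:

```python
import torch

per_item_loss = torch.tensor([0.2, 0.8, 0.5, 1.0])
mask = torch.tensor([1.0, 0.0, 1.0, 0.0])  # only matched primitives contribute

masked_mean = (per_item_loss * mask).sum() / (mask.sum() + 1e-6)
print(masked_mean)  # ~0.35; the 1e-6 keeps the division finite if mask.sum() == 0
```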
+ """ + bbox = input_meta['box_type_3d']( + bbox, + box_dim=bbox.shape[-1], + with_yaw=self.bbox_coder.with_rot, + origin=(0.5, 0.5, 0.5)) + box_indices = bbox.points_in_boxes_all(points) + + corner3d = bbox.corners + minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) + minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] + minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] + + nonempty_box_mask = box_indices.T.sum(1) > 5 + + bbox_classes = torch.argmax(sem_scores, -1) + nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], + obj_scores[nonempty_box_mask], + bbox_classes[nonempty_box_mask], + self.test_cfg.nms_thr) + + # filter empty boxes and boxes with low score + scores_mask = (obj_scores > self.test_cfg.score_thr) + nonempty_box_inds = torch.nonzero( + nonempty_box_mask, as_tuple=False).flatten() + nonempty_mask = torch.zeros_like(bbox_classes).scatter( + 0, nonempty_box_inds[nms_selected], 1) + selected = (nonempty_mask.bool() & scores_mask.bool()) + + if self.test_cfg.per_class_proposal: + bbox_selected, score_selected, labels = [], [], [] + for k in range(sem_scores.shape[-1]): + bbox_selected.append(bbox[selected].tensor) + score_selected.append(obj_scores[selected] * + sem_scores[selected][:, k]) + labels.append( + torch.zeros_like(bbox_classes[selected]).fill_(k)) + bbox_selected = torch.cat(bbox_selected, 0) + score_selected = torch.cat(score_selected, 0) + labels = torch.cat(labels, 0) + else: + bbox_selected = bbox[selected].tensor + score_selected = obj_scores[selected] + labels = bbox_classes[selected] + + return bbox_selected, score_selected, labels + + def get_proposal_stage_loss(self, + bbox_preds, + size_class_targets, + size_res_targets, + dir_class_targets, + dir_res_targets, + center_targets, + mask_targets, + objectness_targets, + objectness_weights, + box_loss_weights, + valid_gt_weights, + suffix=''): + """Compute loss for the aggregation module. + + Args: + bbox_preds (dict): Predictions from forward of vote head. + size_class_targets (torch.Tensor): Ground truth + size class of each prediction bounding box. + size_res_targets (torch.Tensor): Ground truth + size residual of each prediction bounding box. + dir_class_targets (torch.Tensor): Ground truth + direction class of each prediction bounding box. + dir_res_targets (torch.Tensor): Ground truth + direction residual of each prediction bounding box. + center_targets (torch.Tensor): Ground truth center + of each prediction bounding box. + mask_targets (torch.Tensor): Validation of each + prediction bounding box. + objectness_targets (torch.Tensor): Ground truth + objectness label of each prediction bounding box. + objectness_weights (torch.Tensor): Weights of objectness + loss for each prediction bounding box. + box_loss_weights (torch.Tensor): Weights of regression + loss for each prediction bounding box. + valid_gt_weights (torch.Tensor): Validation of each + ground truth bounding box. + + Returns: + dict: Losses of aggregation module. 
+ """ + # calculate objectness loss + objectness_loss = self.objectness_loss( + bbox_preds['obj_scores' + suffix].transpose(2, 1), + objectness_targets, + weight=objectness_weights) + + # calculate center loss + source2target_loss, target2source_loss = self.center_loss( + bbox_preds['center' + suffix], + center_targets, + src_weight=box_loss_weights, + dst_weight=valid_gt_weights) + center_loss = source2target_loss + target2source_loss + + # calculate direction class loss + dir_class_loss = self.dir_class_loss( + bbox_preds['dir_class' + suffix].transpose(2, 1), + dir_class_targets, + weight=box_loss_weights) + + # calculate direction residual loss + batch_size, proposal_num = size_class_targets.shape[:2] + heading_label_one_hot = dir_class_targets.new_zeros( + (batch_size, proposal_num, self.num_dir_bins)) + heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) + dir_res_norm = (bbox_preds['dir_res_norm' + suffix] * + heading_label_one_hot).sum(dim=-1) + dir_res_loss = self.dir_res_loss( + dir_res_norm, dir_res_targets, weight=box_loss_weights) + + # calculate size class loss + size_class_loss = self.size_class_loss( + bbox_preds['size_class' + suffix].transpose(2, 1), + size_class_targets, + weight=box_loss_weights) + + # calculate size residual loss + one_hot_size_targets = box_loss_weights.new_zeros( + (batch_size, proposal_num, self.num_sizes)) + one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) + one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( + -1).repeat(1, 1, 1, 3) + size_residual_norm = (bbox_preds['size_res_norm' + suffix] * + one_hot_size_targets_expand).sum(dim=2) + box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat( + 1, 1, 3) + size_res_loss = self.size_res_loss( + size_residual_norm, + size_res_targets, + weight=box_loss_weights_expand) + + # calculate semantic loss + semantic_loss = self.semantic_loss( + bbox_preds['sem_scores' + suffix].transpose(2, 1), + mask_targets, + weight=box_loss_weights) + + losses = dict( + objectness_loss=objectness_loss, + semantic_loss=semantic_loss, + center_loss=center_loss, + dir_class_loss=dir_class_loss, + dir_res_loss=dir_res_loss, + size_class_loss=size_class_loss, + size_res_loss=size_res_loss) + + return losses + + def get_targets(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + bbox_preds=None): + """Generate targets of proposal module. + + Args: + points (list[torch.Tensor]): Points of each batch. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): Labels of each batch. + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): Point-wise instance + label of each batch. + bbox_preds (torch.Tensor): Bounding box predictions of vote head. + + Returns: + tuple[torch.Tensor]: Targets of proposal module. 
+ """ + # find empty example + valid_gt_masks = list() + gt_num = list() + for index in range(len(gt_labels_3d)): + if len(gt_labels_3d[index]) == 0: + fake_box = gt_bboxes_3d[index].tensor.new_zeros( + 1, gt_bboxes_3d[index].tensor.shape[-1]) + gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) + gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) + valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) + gt_num.append(1) + else: + valid_gt_masks.append(gt_labels_3d[index].new_ones( + gt_labels_3d[index].shape)) + gt_num.append(gt_labels_3d[index].shape[0]) + + if pts_semantic_mask is None: + pts_semantic_mask = [None for i in range(len(gt_labels_3d))] + pts_instance_mask = [None for i in range(len(gt_labels_3d))] + + aggregated_points = [ + bbox_preds['aggregated_points'][i] + for i in range(len(gt_labels_3d)) + ] + + surface_center_pred = [ + bbox_preds['surface_center_pred'][i] + for i in range(len(gt_labels_3d)) + ] + + line_center_pred = [ + bbox_preds['pred_line_center'][i] + for i in range(len(gt_labels_3d)) + ] + + surface_center_object = [ + bbox_preds['surface_center_object'][i] + for i in range(len(gt_labels_3d)) + ] + + line_center_object = [ + bbox_preds['line_center_object'][i] + for i in range(len(gt_labels_3d)) + ] + + surface_sem_pred = [ + bbox_preds['surface_sem_pred'][i] + for i in range(len(gt_labels_3d)) + ] + + line_sem_pred = [ + bbox_preds['sem_cls_scores_line'][i] + for i in range(len(gt_labels_3d)) + ] + + (cues_objectness_label, cues_sem_label, proposal_objectness_label, + cues_mask, cues_match_mask, proposal_objectness_mask, + cues_matching_label, obj_surface_line_center) = multi_apply( + self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, aggregated_points, + surface_center_pred, line_center_pred, surface_center_object, + line_center_object, surface_sem_pred, line_sem_pred) + + cues_objectness_label = torch.stack(cues_objectness_label) + cues_sem_label = torch.stack(cues_sem_label) + proposal_objectness_label = torch.stack(proposal_objectness_label) + cues_mask = torch.stack(cues_mask) + cues_match_mask = torch.stack(cues_match_mask) + proposal_objectness_mask = torch.stack(proposal_objectness_mask) + cues_matching_label = torch.stack(cues_matching_label) + obj_surface_line_center = torch.stack(obj_surface_line_center) + + return (cues_objectness_label, cues_sem_label, + proposal_objectness_label, cues_mask, cues_match_mask, + proposal_objectness_mask, cues_matching_label, + obj_surface_line_center) + + def get_targets_single(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + aggregated_points=None, + pred_surface_center=None, + pred_line_center=None, + pred_obj_surface_center=None, + pred_obj_line_center=None, + pred_surface_sem=None, + pred_line_sem=None): + """Generate targets for primitive cues for single batch. + + Args: + points (torch.Tensor): Points of each batch. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth + boxes of each batch. + gt_labels_3d (torch.Tensor): Labels of each batch. + pts_semantic_mask (torch.Tensor): Point-wise semantic + label of each batch. + pts_instance_mask (torch.Tensor): Point-wise instance + label of each batch. + aggregated_points (torch.Tensor): Aggregated points from + vote aggregation layer. + pred_surface_center (torch.Tensor): Prediction of surface center. + pred_line_center (torch.Tensor): Prediction of line center. 
+ pred_obj_surface_center (torch.Tensor): Objectness prediction + of surface center. + pred_obj_line_center (torch.Tensor): Objectness prediction of + line center. + pred_surface_sem (torch.Tensor): Semantic prediction of + surface center. + pred_line_sem (torch.Tensor): Semantic prediction of line center. + Returns: + tuple[torch.Tensor]: Targets for primitive cues. + """ + device = points.device + gt_bboxes_3d = gt_bboxes_3d.to(device) + num_proposals = aggregated_points.shape[0] + gt_center = gt_bboxes_3d.gravity_center + + dist1, dist2, ind1, _ = chamfer_distance( + aggregated_points.unsqueeze(0), + gt_center.unsqueeze(0), + reduction='none') + # Set assignment + object_assignment = ind1.squeeze(0) + + # Generate objectness label and mask + # objectness_label: 1 if pred object center is within + # self.train_cfg['near_threshold'] of any GT object + # objectness_mask: 0 if pred object center is in gray + # zone (DONOTCARE), 1 otherwise + euclidean_dist1 = torch.sqrt(dist1.squeeze(0) + 1e-6) + proposal_objectness_label = euclidean_dist1.new_zeros( + num_proposals, dtype=torch.long) + proposal_objectness_mask = euclidean_dist1.new_zeros(num_proposals) + + gt_sem = gt_labels_3d[object_assignment] + + obj_surface_center, obj_line_center = \ + gt_bboxes_3d.get_surface_line_center() + obj_surface_center = obj_surface_center.reshape(-1, 6, + 3).transpose(0, 1) + obj_line_center = obj_line_center.reshape(-1, 12, 3).transpose(0, 1) + obj_surface_center = obj_surface_center[:, object_assignment].reshape( + 1, -1, 3) + obj_line_center = obj_line_center[:, + object_assignment].reshape(1, -1, 3) + + surface_sem = torch.argmax(pred_surface_sem, dim=1).float() + line_sem = torch.argmax(pred_line_sem, dim=1).float() + + dist_surface, _, surface_ind, _ = chamfer_distance( + obj_surface_center, + pred_surface_center.unsqueeze(0), + reduction='none') + dist_line, _, line_ind, _ = chamfer_distance( + obj_line_center, pred_line_center.unsqueeze(0), reduction='none') + + surface_sel = pred_surface_center[surface_ind.squeeze(0)] + line_sel = pred_line_center[line_ind.squeeze(0)] + surface_sel_sem = surface_sem[surface_ind.squeeze(0)] + line_sel_sem = line_sem[line_ind.squeeze(0)] + + surface_sel_sem_gt = gt_sem.repeat(6).float() + line_sel_sem_gt = gt_sem.repeat(12).float() + + euclidean_dist_surface = torch.sqrt(dist_surface.squeeze(0) + 1e-6) + euclidean_dist_line = torch.sqrt(dist_line.squeeze(0) + 1e-6) + objectness_label_surface = euclidean_dist_line.new_zeros( + num_proposals * 6, dtype=torch.long) + objectness_mask_surface = euclidean_dist_line.new_zeros(num_proposals * + 6) + objectness_label_line = euclidean_dist_line.new_zeros( + num_proposals * 12, dtype=torch.long) + objectness_mask_line = euclidean_dist_line.new_zeros(num_proposals * + 12) + objectness_label_surface_sem = euclidean_dist_line.new_zeros( + num_proposals * 6, dtype=torch.long) + objectness_label_line_sem = euclidean_dist_line.new_zeros( + num_proposals * 12, dtype=torch.long) + + euclidean_dist_obj_surface = torch.sqrt(( + (pred_obj_surface_center - surface_sel)**2).sum(dim=-1) + 1e-6) + euclidean_dist_obj_line = torch.sqrt( + torch.sum((pred_obj_line_center - line_sel)**2, dim=-1) + 1e-6) + + # Objectness score just with centers + proposal_objectness_label[ + euclidean_dist1 < self.train_cfg['near_threshold']] = 1 + proposal_objectness_mask[ + euclidean_dist1 < self.train_cfg['near_threshold']] = 1 + proposal_objectness_mask[ + euclidean_dist1 > self.train_cfg['far_threshold']] = 1 + + objectness_label_surface[ + 
(euclidean_dist_obj_surface < + self.train_cfg['label_surface_threshold']) * + (euclidean_dist_surface < + self.train_cfg['mask_surface_threshold'])] = 1 + objectness_label_surface_sem[ + (euclidean_dist_obj_surface < + self.train_cfg['label_surface_threshold']) * + (euclidean_dist_surface < self.train_cfg['mask_surface_threshold']) + * (surface_sel_sem == surface_sel_sem_gt)] = 1 + + objectness_label_line[ + (euclidean_dist_obj_line < self.train_cfg['label_line_threshold']) + * + (euclidean_dist_line < self.train_cfg['mask_line_threshold'])] = 1 + objectness_label_line_sem[ + (euclidean_dist_obj_line < self.train_cfg['label_line_threshold']) + * (euclidean_dist_line < self.train_cfg['mask_line_threshold']) * + (line_sel_sem == line_sel_sem_gt)] = 1 + + objectness_label_surface_obj = proposal_objectness_label.repeat(6) + objectness_mask_surface_obj = proposal_objectness_mask.repeat(6) + objectness_label_line_obj = proposal_objectness_label.repeat(12) + objectness_mask_line_obj = proposal_objectness_mask.repeat(12) + + objectness_mask_surface = objectness_mask_surface_obj + objectness_mask_line = objectness_mask_line_obj + + cues_objectness_label = torch.cat( + (objectness_label_surface, objectness_label_line), 0) + cues_sem_label = torch.cat( + (objectness_label_surface_sem, objectness_label_line_sem), 0) + cues_mask = torch.cat((objectness_mask_surface, objectness_mask_line), + 0) + + objectness_label_surface *= objectness_label_surface_obj + objectness_label_line *= objectness_label_line_obj + cues_matching_label = torch.cat( + (objectness_label_surface, objectness_label_line), 0) + + objectness_label_surface_sem *= objectness_label_surface_obj + objectness_label_line_sem *= objectness_label_line_obj + + cues_match_mask = (torch.sum( + cues_objectness_label.view(18, num_proposals), dim=0) >= + 1).float() + + obj_surface_line_center = torch.cat( + (obj_surface_center, obj_line_center), 1).squeeze(0) + + return (cues_objectness_label, cues_sem_label, + proposal_objectness_label, cues_mask, cues_match_mask, + proposal_objectness_mask, cues_matching_label, + obj_surface_line_center) diff --git a/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py b/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py index 6f5ea72..9a377ab 100644 --- a/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py +++ b/mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py @@ -1,629 +1,629 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from mmcv.cnn import ConvModule, normal_init - -from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE - -if IS_SPCONV2_AVAILABLE: - from spconv.pytorch import (SparseConvTensor, SparseMaxPool3d, - SparseSequential) -else: - from mmcv.ops import SparseConvTensor, SparseMaxPool3d, SparseSequential - -from mmcv.runner import BaseModule -from torch import nn as nn - -from mmdet3d.core.bbox.structures import (LiDARInstance3DBoxes, - rotation_3d_in_axis, xywhr2xyxyr) -from mmdet3d.core.post_processing import nms_bev, nms_normal_bev -from mmdet3d.models.builder import HEADS, build_loss -from mmdet3d.ops import make_sparse_convmodule -from mmdet.core import build_bbox_coder, multi_apply - - -@HEADS.register_module() -class PartA2BboxHead(BaseModule): - """PartA2 RoI head. - - Args: - num_classes (int): The number of classes to prediction. - seg_in_channels (int): Input channels of segmentation - convolution layer. - part_in_channels (int): Input channels of part convolution layer. 
- seg_conv_channels (list(int)): Out channels of each - segmentation convolution layer. - part_conv_channels (list(int)): Out channels of each - part convolution layer. - merge_conv_channels (list(int)): Out channels of each - feature merged convolution layer. - down_conv_channels (list(int)): Out channels of each - downsampled convolution layer. - shared_fc_channels (list(int)): Out channels of each shared fc layer. - cls_channels (list(int)): Out channels of each classification layer. - reg_channels (list(int)): Out channels of each regression layer. - dropout_ratio (float): Dropout ratio of classification and - regression layers. - roi_feat_size (int): The size of pooled roi features. - with_corner_loss (bool): Whether to use corner loss or not. - bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for box head. - conv_cfg (dict): Config dict of convolutional layers - norm_cfg (dict): Config dict of normalization layers - loss_bbox (dict): Config dict of box regression loss. - loss_cls (dict): Config dict of classifacation loss. - """ - - def __init__(self, - num_classes, - seg_in_channels, - part_in_channels, - seg_conv_channels=None, - part_conv_channels=None, - merge_conv_channels=None, - down_conv_channels=None, - shared_fc_channels=None, - cls_channels=None, - reg_channels=None, - dropout_ratio=0.1, - roi_feat_size=14, - with_corner_loss=True, - bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - loss_bbox=dict( - type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - reduction='none', - loss_weight=1.0), - init_cfg=None): - super(PartA2BboxHead, self).__init__(init_cfg=init_cfg) - self.num_classes = num_classes - self.with_corner_loss = with_corner_loss - self.bbox_coder = build_bbox_coder(bbox_coder) - self.loss_bbox = build_loss(loss_bbox) - self.loss_cls = build_loss(loss_cls) - self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) - - assert down_conv_channels[-1] == shared_fc_channels[0] - - # init layers - part_channel_last = part_in_channels - part_conv = [] - for i, channel in enumerate(part_conv_channels): - part_conv.append( - make_sparse_convmodule( - part_channel_last, - channel, - 3, - padding=1, - norm_cfg=norm_cfg, - indice_key=f'rcnn_part{i}', - conv_type='SubMConv3d')) - part_channel_last = channel - self.part_conv = SparseSequential(*part_conv) - - seg_channel_last = seg_in_channels - seg_conv = [] - for i, channel in enumerate(seg_conv_channels): - seg_conv.append( - make_sparse_convmodule( - seg_channel_last, - channel, - 3, - padding=1, - norm_cfg=norm_cfg, - indice_key=f'rcnn_seg{i}', - conv_type='SubMConv3d')) - seg_channel_last = channel - self.seg_conv = SparseSequential(*seg_conv) - - self.conv_down = SparseSequential() - - merge_conv_channel_last = part_channel_last + seg_channel_last - merge_conv = [] - for i, channel in enumerate(merge_conv_channels): - merge_conv.append( - make_sparse_convmodule( - merge_conv_channel_last, - channel, - 3, - padding=1, - norm_cfg=norm_cfg, - indice_key='rcnn_down0')) - merge_conv_channel_last = channel - - down_conv_channel_last = merge_conv_channel_last - conv_down = [] - for i, channel in enumerate(down_conv_channels): - conv_down.append( - make_sparse_convmodule( - down_conv_channel_last, - channel, - 3, - padding=1, - norm_cfg=norm_cfg, - indice_key='rcnn_down1')) - down_conv_channel_last = channel - - self.conv_down.add_module('merge_conv', 
SparseSequential(*merge_conv)) - self.conv_down.add_module('max_pool3d', - SparseMaxPool3d(kernel_size=2, stride=2)) - self.conv_down.add_module('down_conv', SparseSequential(*conv_down)) - - shared_fc_list = [] - pool_size = roi_feat_size // 2 - pre_channel = shared_fc_channels[0] * pool_size**3 - for k in range(1, len(shared_fc_channels)): - shared_fc_list.append( - ConvModule( - pre_channel, - shared_fc_channels[k], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - inplace=True)) - pre_channel = shared_fc_channels[k] - - if k != len(shared_fc_channels) - 1 and dropout_ratio > 0: - shared_fc_list.append(nn.Dropout(dropout_ratio)) - - self.shared_fc = nn.Sequential(*shared_fc_list) - - # Classification layer - channel_in = shared_fc_channels[-1] - cls_channel = 1 - cls_layers = [] - pre_channel = channel_in - for k in range(0, len(cls_channels)): - cls_layers.append( - ConvModule( - pre_channel, - cls_channels[k], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - inplace=True)) - pre_channel = cls_channels[k] - cls_layers.append( - ConvModule( - pre_channel, - cls_channel, - 1, - padding=0, - conv_cfg=conv_cfg, - act_cfg=None)) - if dropout_ratio >= 0: - cls_layers.insert(1, nn.Dropout(dropout_ratio)) - - self.conv_cls = nn.Sequential(*cls_layers) - - # Regression layer - reg_layers = [] - pre_channel = channel_in - for k in range(0, len(reg_channels)): - reg_layers.append( - ConvModule( - pre_channel, - reg_channels[k], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - inplace=True)) - pre_channel = reg_channels[k] - reg_layers.append( - ConvModule( - pre_channel, - self.bbox_coder.code_size, - 1, - padding=0, - conv_cfg=conv_cfg, - act_cfg=None)) - if dropout_ratio >= 0: - reg_layers.insert(1, nn.Dropout(dropout_ratio)) - - self.conv_reg = nn.Sequential(*reg_layers) - - if init_cfg is None: - self.init_cfg = dict( - type='Xavier', - layer=['Conv2d', 'Conv1d'], - distribution='uniform') - - def init_weights(self): - super().init_weights() - normal_init(self.conv_reg[-1].conv, mean=0, std=0.001) - - def forward(self, seg_feats, part_feats): - """Forward pass. - - Args: - seg_feats (torch.Tensor): Point-wise semantic features. - part_feats (torch.Tensor): Point-wise part prediction features. - - Returns: - tuple[torch.Tensor]: Score of class and bbox predictions. 
- """ - # (B * N, out_x, out_y, out_z, 4) - rcnn_batch_size = part_feats.shape[0] - - # transform to sparse tensors - sparse_shape = part_feats.shape[1:4] - # (non_empty_num, 4) ==> [bs_idx, x_idx, y_idx, z_idx] - sparse_idx = part_feats.sum(dim=-1).nonzero(as_tuple=False) - - part_features = part_feats[sparse_idx[:, 0], sparse_idx[:, 1], - sparse_idx[:, 2], sparse_idx[:, 3]] - seg_features = seg_feats[sparse_idx[:, 0], sparse_idx[:, 1], - sparse_idx[:, 2], sparse_idx[:, 3]] - coords = sparse_idx.int().contiguous() - part_features = SparseConvTensor(part_features, coords, sparse_shape, - rcnn_batch_size) - seg_features = SparseConvTensor(seg_features, coords, sparse_shape, - rcnn_batch_size) - - # forward rcnn network - x_part = self.part_conv(part_features) - x_rpn = self.seg_conv(seg_features) - - merged_feature = torch.cat((x_rpn.features, x_part.features), - dim=1) # (N, C) - shared_feature = SparseConvTensor(merged_feature, coords, sparse_shape, - rcnn_batch_size) - - x = self.conv_down(shared_feature) - - shared_feature = x.dense().view(rcnn_batch_size, -1, 1) - - shared_feature = self.shared_fc(shared_feature) - - cls_score = self.conv_cls(shared_feature).transpose( - 1, 2).contiguous().squeeze(dim=1) # (B, 1) - bbox_pred = self.conv_reg(shared_feature).transpose( - 1, 2).contiguous().squeeze(dim=1) # (B, C) - - return cls_score, bbox_pred - - def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, - pos_gt_bboxes, reg_mask, label_weights, bbox_weights): - """Computing losses. - - Args: - cls_score (torch.Tensor): Scores of each roi. - bbox_pred (torch.Tensor): Predictions of bboxes. - rois (torch.Tensor): Roi bboxes. - labels (torch.Tensor): Labels of class. - bbox_targets (torch.Tensor): Target of positive bboxes. - pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes. - reg_mask (torch.Tensor): Mask for positive bboxes. - label_weights (torch.Tensor): Weights of class loss. - bbox_weights (torch.Tensor): Weights of bbox loss. - - Returns: - dict: Computed losses. - - - loss_cls (torch.Tensor): Loss of classes. - - loss_bbox (torch.Tensor): Loss of bboxes. - - loss_corner (torch.Tensor): Loss of corners. 
- """ - losses = dict() - rcnn_batch_size = cls_score.shape[0] - - # calculate class loss - cls_flat = cls_score.view(-1) - loss_cls = self.loss_cls(cls_flat, labels, label_weights) - losses['loss_cls'] = loss_cls - - # calculate regression loss - code_size = self.bbox_coder.code_size - pos_inds = (reg_mask > 0) - if pos_inds.any() == 0: - # fake a part loss - losses['loss_bbox'] = loss_cls.new_tensor(0) - if self.with_corner_loss: - losses['loss_corner'] = loss_cls.new_tensor(0) - else: - pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds] - bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat( - 1, pos_bbox_pred.shape[-1]) - loss_bbox = self.loss_bbox( - pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0), - bbox_weights_flat.unsqueeze(dim=0)) - losses['loss_bbox'] = loss_bbox - - if self.with_corner_loss: - pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds] - pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size) - batch_anchors = pos_roi_boxes3d.clone().detach() - pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1) - roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3) - batch_anchors[..., 0:3] = 0 - # decode boxes - pred_boxes3d = self.bbox_coder.decode( - batch_anchors, - pos_bbox_pred.view(-1, code_size)).view(-1, code_size) - - pred_boxes3d[..., 0:3] = rotation_3d_in_axis( - pred_boxes3d[..., 0:3].unsqueeze(1), - pos_rois_rotation, - axis=2).squeeze(1) - - pred_boxes3d[:, 0:3] += roi_xyz - - # calculate corner loss - loss_corner = self.get_corner_loss_lidar( - pred_boxes3d, pos_gt_bboxes) - losses['loss_corner'] = loss_corner - - return losses - - def get_targets(self, sampling_results, rcnn_train_cfg, concat=True): - """Generate targets. - - Args: - sampling_results (list[:obj:`SamplingResult`]): - Sampled results from rois. - rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn. - concat (bool): Whether to concatenate targets between batches. - - Returns: - tuple[torch.Tensor]: Targets of boxes and class prediction. - """ - pos_bboxes_list = [res.pos_bboxes for res in sampling_results] - pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] - iou_list = [res.iou for res in sampling_results] - targets = multi_apply( - self._get_target_single, - pos_bboxes_list, - pos_gt_bboxes_list, - iou_list, - cfg=rcnn_train_cfg) - - (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, - bbox_weights) = targets - - if concat: - label = torch.cat(label, 0) - bbox_targets = torch.cat(bbox_targets, 0) - pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0) - reg_mask = torch.cat(reg_mask, 0) - - label_weights = torch.cat(label_weights, 0) - label_weights /= torch.clamp(label_weights.sum(), min=1.0) - - bbox_weights = torch.cat(bbox_weights, 0) - bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0) - - return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, - bbox_weights) - - def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): - """Generate training targets for a single sample. - - Args: - pos_bboxes (torch.Tensor): Positive boxes with shape - (N, 7). - pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape - (M, 7). - ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes` - in shape (N, M). - cfg (dict): Training configs. - - Returns: - tuple[torch.Tensor]: Target for positive boxes. 
- (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, - bbox_weights) - """ - cls_pos_mask = ious > cfg.cls_pos_thr - cls_neg_mask = ious < cfg.cls_neg_thr - interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0) - - # iou regression target - label = (cls_pos_mask > 0).float() - label[interval_mask] = ious[interval_mask] * 2 - 0.5 - # label weights - label_weights = (label >= 0).float() - - # box regression target - reg_mask = pos_bboxes.new_zeros(ious.size(0)).long() - reg_mask[0:pos_gt_bboxes.size(0)] = 1 - bbox_weights = (reg_mask > 0).float() - if reg_mask.bool().any(): - pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach() - roi_center = pos_bboxes[..., 0:3] - roi_ry = pos_bboxes[..., 6] % (2 * np.pi) - - # canonical transformation - pos_gt_bboxes_ct[..., 0:3] -= roi_center - pos_gt_bboxes_ct[..., 6] -= roi_ry - pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis( - pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -roi_ry, - axis=2).squeeze(1) - - # flip orientation if rois have opposite orientation - ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi - opposite_flag = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5) - ry_label[opposite_flag] = (ry_label[opposite_flag] + np.pi) % ( - 2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi) - flag = ry_label > np.pi - ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2) - ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2) - pos_gt_bboxes_ct[..., 6] = ry_label - - rois_anchor = pos_bboxes.clone().detach() - rois_anchor[:, 0:3] = 0 - rois_anchor[:, 6] = 0 - bbox_targets = self.bbox_coder.encode(rois_anchor, - pos_gt_bboxes_ct) - else: - # no fg bbox - bbox_targets = pos_gt_bboxes.new_empty((0, 7)) - - return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, - bbox_weights) - - def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0): - """Calculate corner loss of given boxes. - - Args: - pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7). - gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7). - delta (float, optional): huber loss threshold. Defaults to 1.0 - - Returns: - torch.FloatTensor: Calculated corner loss in shape (N). - """ - assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0] - - # This is a little bit hack here because we assume the box for - # Part-A2 is in LiDAR coordinates - gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d) - pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners - gt_box_corners = gt_boxes_structure.corners - - # This flip only changes the heading direction of GT boxes - gt_bbox3d_flip = gt_boxes_structure.clone() - gt_bbox3d_flip.tensor[:, 6] += np.pi - gt_box_corners_flip = gt_bbox3d_flip.corners - - corner_dist = torch.min( - torch.norm(pred_box_corners - gt_box_corners, dim=2), - torch.norm(pred_box_corners - gt_box_corners_flip, - dim=2)) # (N, 8) - # huber loss - abs_error = corner_dist.abs() - quadratic = abs_error.clamp(max=delta) - linear = (abs_error - quadratic) - corner_loss = 0.5 * quadratic**2 + delta * linear - - return corner_loss.mean(dim=1) - - def get_bboxes(self, - rois, - cls_score, - bbox_pred, - class_labels, - class_pred, - img_metas, - cfg=None): - """Generate bboxes from bbox head predictions. - - Args: - rois (torch.Tensor): Roi bounding boxes. - cls_score (torch.Tensor): Scores of bounding boxes. - bbox_pred (torch.Tensor): Bounding boxes predictions - class_labels (torch.Tensor): Label of classes - class_pred (torch.Tensor): Score for nms. - img_metas (list[dict]): Point cloud and image's meta info. 
- cfg (:obj:`ConfigDict`): Testing config. - - Returns: - list[tuple]: Decoded bbox, scores and labels after nms. - """ - roi_batch_id = rois[..., 0] - roi_boxes = rois[..., 1:] # boxes without batch id - batch_size = int(roi_batch_id.max().item() + 1) - - # decode boxes - roi_ry = roi_boxes[..., 6].view(-1) - roi_xyz = roi_boxes[..., 0:3].view(-1, 3) - local_roi_boxes = roi_boxes.clone().detach() - local_roi_boxes[..., 0:3] = 0 - rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred) - rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis( - rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1) - rcnn_boxes3d[:, 0:3] += roi_xyz - - # post processing - result_list = [] - for batch_id in range(batch_size): - cur_class_labels = class_labels[batch_id] - cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1) - - cur_box_prob = class_pred[batch_id] - cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id] - keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, - cfg.score_thr, cfg.nms_thr, - img_metas[batch_id], - cfg.use_rotate_nms) - selected_bboxes = cur_rcnn_boxes3d[keep] - selected_label_preds = cur_class_labels[keep] - selected_scores = cur_cls_score[keep] - - result_list.append( - (img_metas[batch_id]['box_type_3d'](selected_bboxes, - self.bbox_coder.code_size), - selected_scores, selected_label_preds)) - return result_list - - def multi_class_nms(self, - box_probs, - box_preds, - score_thr, - nms_thr, - input_meta, - use_rotate_nms=True): - """Multi-class NMS for box head. - - Note: - This function has large overlap with the `box3d_multiclass_nms` - implemented in `mmdet3d.core.post_processing`. We are considering - merging these two functions in the future. - - Args: - box_probs (torch.Tensor): Predicted boxes probabitilies in - shape (N,). - box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C). - score_thr (float): Threshold of scores. - nms_thr (float): Threshold for NMS. - input_meta (dict): Meta information of the current sample. - use_rotate_nms (bool, optional): Whether to use rotated nms. - Defaults to True. - - Returns: - torch.Tensor: Selected indices. - """ - if use_rotate_nms: - nms_func = nms_bev - else: - nms_func = nms_normal_bev - - assert box_probs.shape[ - 1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}' - selected_list = [] - selected_labels = [] - boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( - box_preds, self.bbox_coder.code_size).bev) - - score_thresh = score_thr if isinstance( - score_thr, list) else [score_thr for x in range(self.num_classes)] - nms_thresh = nms_thr if isinstance( - nms_thr, list) else [nms_thr for x in range(self.num_classes)] - for k in range(0, self.num_classes): - class_scores_keep = box_probs[:, k] >= score_thresh[k] - - if class_scores_keep.int().sum() > 0: - original_idxs = class_scores_keep.nonzero( - as_tuple=False).view(-1) - cur_boxes_for_nms = boxes_for_nms[class_scores_keep] - cur_rank_scores = box_probs[class_scores_keep, k] - - cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores, - nms_thresh[k]) - - if cur_selected.shape[0] == 0: - continue - selected_list.append(original_idxs[cur_selected]) - selected_labels.append( - torch.full([cur_selected.shape[0]], - k + 1, - dtype=torch.int64, - device=box_preds.device)) - - keep = torch.cat( - selected_list, dim=0) if len(selected_list) > 0 else [] - return keep +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import torch +from mmcv.cnn import ConvModule, normal_init + +from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE + +if IS_SPCONV2_AVAILABLE: + from spconv.pytorch import (SparseConvTensor, SparseMaxPool3d, + SparseSequential) +else: + from mmcv.ops import SparseConvTensor, SparseMaxPool3d, SparseSequential + +from mmcv.runner import BaseModule +from torch import nn as nn + +from mmdet3d.core.bbox.structures import (LiDARInstance3DBoxes, + rotation_3d_in_axis, xywhr2xyxyr) +from mmdet3d.core.post_processing import nms_bev, nms_normal_bev +from mmdet3d.models.builder import HEADS, build_loss +from mmdet3d.ops import make_sparse_convmodule +from mmdet.core import build_bbox_coder, multi_apply + + +@HEADS.register_module() +class PartA2BboxHead(BaseModule): + """PartA2 RoI head. + + Args: + num_classes (int): The number of classes to prediction. + seg_in_channels (int): Input channels of segmentation + convolution layer. + part_in_channels (int): Input channels of part convolution layer. + seg_conv_channels (list(int)): Out channels of each + segmentation convolution layer. + part_conv_channels (list(int)): Out channels of each + part convolution layer. + merge_conv_channels (list(int)): Out channels of each + feature merged convolution layer. + down_conv_channels (list(int)): Out channels of each + downsampled convolution layer. + shared_fc_channels (list(int)): Out channels of each shared fc layer. + cls_channels (list(int)): Out channels of each classification layer. + reg_channels (list(int)): Out channels of each regression layer. + dropout_ratio (float): Dropout ratio of classification and + regression layers. + roi_feat_size (int): The size of pooled roi features. + with_corner_loss (bool): Whether to use corner loss or not. + bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for box head. + conv_cfg (dict): Config dict of convolutional layers + norm_cfg (dict): Config dict of normalization layers + loss_bbox (dict): Config dict of box regression loss. + loss_cls (dict): Config dict of classifacation loss. 
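The arguments documented above map one-to-one onto the constructor that follows. As a reference point, a config-style instantiation might look like the sketch below; the channel widths and class count are illustrative placeholders, not values copied from a shipped Part-A2 config:

```python
# Hypothetical bbox_head entry for a roi_head config; values are illustrative only.
bbox_head = dict(
    type='PartA2BboxHead',
    num_classes=1,
    seg_in_channels=16,
    part_in_channels=4,
    seg_conv_channels=[64, 64],
    part_conv_channels=[64, 64],
    merge_conv_channels=[128, 128],
    down_conv_channels=[128, 256],
    shared_fc_channels=[256, 512, 512],
    cls_channels=[256, 256],
    reg_channels=[256, 256],
    dropout_ratio=0.1,
    roi_feat_size=14,
    with_corner_loss=True,
    bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
    loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
    loss_cls=dict(
        type='CrossEntropyLoss',
        use_sigmoid=True,
        reduction='none',
        loss_weight=1.0))
```

Note the constraint asserted in `__init__` below: `down_conv_channels[-1]` must equal `shared_fc_channels[0]`, which the illustrative values respect.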
+ """ + + def __init__(self, + num_classes, + seg_in_channels, + part_in_channels, + seg_conv_channels=None, + part_conv_channels=None, + merge_conv_channels=None, + down_conv_channels=None, + shared_fc_channels=None, + cls_channels=None, + reg_channels=None, + dropout_ratio=0.1, + roi_feat_size=14, + with_corner_loss=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='none', + loss_weight=1.0), + init_cfg=None): + super(PartA2BboxHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.with_corner_loss = with_corner_loss + self.bbox_coder = build_bbox_coder(bbox_coder) + self.loss_bbox = build_loss(loss_bbox) + self.loss_cls = build_loss(loss_cls) + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + + assert down_conv_channels[-1] == shared_fc_channels[0] + + # init layers + part_channel_last = part_in_channels + part_conv = [] + for i, channel in enumerate(part_conv_channels): + part_conv.append( + make_sparse_convmodule( + part_channel_last, + channel, + 3, + padding=1, + norm_cfg=norm_cfg, + indice_key=f'rcnn_part{i}', + conv_type='SubMConv3d')) + part_channel_last = channel + self.part_conv = SparseSequential(*part_conv) + + seg_channel_last = seg_in_channels + seg_conv = [] + for i, channel in enumerate(seg_conv_channels): + seg_conv.append( + make_sparse_convmodule( + seg_channel_last, + channel, + 3, + padding=1, + norm_cfg=norm_cfg, + indice_key=f'rcnn_seg{i}', + conv_type='SubMConv3d')) + seg_channel_last = channel + self.seg_conv = SparseSequential(*seg_conv) + + self.conv_down = SparseSequential() + + merge_conv_channel_last = part_channel_last + seg_channel_last + merge_conv = [] + for i, channel in enumerate(merge_conv_channels): + merge_conv.append( + make_sparse_convmodule( + merge_conv_channel_last, + channel, + 3, + padding=1, + norm_cfg=norm_cfg, + indice_key='rcnn_down0')) + merge_conv_channel_last = channel + + down_conv_channel_last = merge_conv_channel_last + conv_down = [] + for i, channel in enumerate(down_conv_channels): + conv_down.append( + make_sparse_convmodule( + down_conv_channel_last, + channel, + 3, + padding=1, + norm_cfg=norm_cfg, + indice_key='rcnn_down1')) + down_conv_channel_last = channel + + self.conv_down.add_module('merge_conv', SparseSequential(*merge_conv)) + self.conv_down.add_module('max_pool3d', + SparseMaxPool3d(kernel_size=2, stride=2)) + self.conv_down.add_module('down_conv', SparseSequential(*conv_down)) + + shared_fc_list = [] + pool_size = roi_feat_size // 2 + pre_channel = shared_fc_channels[0] * pool_size**3 + for k in range(1, len(shared_fc_channels)): + shared_fc_list.append( + ConvModule( + pre_channel, + shared_fc_channels[k], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + inplace=True)) + pre_channel = shared_fc_channels[k] + + if k != len(shared_fc_channels) - 1 and dropout_ratio > 0: + shared_fc_list.append(nn.Dropout(dropout_ratio)) + + self.shared_fc = nn.Sequential(*shared_fc_list) + + # Classification layer + channel_in = shared_fc_channels[-1] + cls_channel = 1 + cls_layers = [] + pre_channel = channel_in + for k in range(0, len(cls_channels)): + cls_layers.append( + ConvModule( + pre_channel, + cls_channels[k], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + inplace=True)) + pre_channel = cls_channels[k] + cls_layers.append( 
+ ConvModule( + pre_channel, + cls_channel, + 1, + padding=0, + conv_cfg=conv_cfg, + act_cfg=None)) + if dropout_ratio >= 0: + cls_layers.insert(1, nn.Dropout(dropout_ratio)) + + self.conv_cls = nn.Sequential(*cls_layers) + + # Regression layer + reg_layers = [] + pre_channel = channel_in + for k in range(0, len(reg_channels)): + reg_layers.append( + ConvModule( + pre_channel, + reg_channels[k], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + inplace=True)) + pre_channel = reg_channels[k] + reg_layers.append( + ConvModule( + pre_channel, + self.bbox_coder.code_size, + 1, + padding=0, + conv_cfg=conv_cfg, + act_cfg=None)) + if dropout_ratio >= 0: + reg_layers.insert(1, nn.Dropout(dropout_ratio)) + + self.conv_reg = nn.Sequential(*reg_layers) + + if init_cfg is None: + self.init_cfg = dict( + type='Xavier', + layer=['Conv2d', 'Conv1d'], + distribution='uniform') + + def init_weights(self): + super().init_weights() + normal_init(self.conv_reg[-1].conv, mean=0, std=0.001) + + def forward(self, seg_feats, part_feats): + """Forward pass. + + Args: + seg_feats (torch.Tensor): Point-wise semantic features. + part_feats (torch.Tensor): Point-wise part prediction features. + + Returns: + tuple[torch.Tensor]: Score of class and bbox predictions. + """ + # (B * N, out_x, out_y, out_z, 4) + rcnn_batch_size = part_feats.shape[0] + + # transform to sparse tensors + sparse_shape = part_feats.shape[1:4] + # (non_empty_num, 4) ==> [bs_idx, x_idx, y_idx, z_idx] + sparse_idx = part_feats.sum(dim=-1).nonzero(as_tuple=False) + + part_features = part_feats[sparse_idx[:, 0], sparse_idx[:, 1], + sparse_idx[:, 2], sparse_idx[:, 3]] + seg_features = seg_feats[sparse_idx[:, 0], sparse_idx[:, 1], + sparse_idx[:, 2], sparse_idx[:, 3]] + coords = sparse_idx.int().contiguous() + part_features = SparseConvTensor(part_features, coords, sparse_shape, + rcnn_batch_size) + seg_features = SparseConvTensor(seg_features, coords, sparse_shape, + rcnn_batch_size) + + # forward rcnn network + x_part = self.part_conv(part_features) + x_rpn = self.seg_conv(seg_features) + + merged_feature = torch.cat((x_rpn.features, x_part.features), + dim=1) # (N, C) + shared_feature = SparseConvTensor(merged_feature, coords, sparse_shape, + rcnn_batch_size) + + x = self.conv_down(shared_feature) + + shared_feature = x.dense().view(rcnn_batch_size, -1, 1) + + shared_feature = self.shared_fc(shared_feature) + + cls_score = self.conv_cls(shared_feature).transpose( + 1, 2).contiguous().squeeze(dim=1) # (B, 1) + bbox_pred = self.conv_reg(shared_feature).transpose( + 1, 2).contiguous().squeeze(dim=1) # (B, C) + + return cls_score, bbox_pred + + def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, + pos_gt_bboxes, reg_mask, label_weights, bbox_weights): + """Computing losses. + + Args: + cls_score (torch.Tensor): Scores of each roi. + bbox_pred (torch.Tensor): Predictions of bboxes. + rois (torch.Tensor): Roi bboxes. + labels (torch.Tensor): Labels of class. + bbox_targets (torch.Tensor): Target of positive bboxes. + pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes. + reg_mask (torch.Tensor): Mask for positive bboxes. + label_weights (torch.Tensor): Weights of class loss. + bbox_weights (torch.Tensor): Weights of bbox loss. + + Returns: + dict: Computed losses. + + - loss_cls (torch.Tensor): Loss of classes. + - loss_bbox (torch.Tensor): Loss of bboxes. + - loss_corner (torch.Tensor): Loss of corners. 
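The `loss_corner` term above comes from `get_corner_loss_lidar`, defined further down in this file: corner distances are penalized with a Huber-style curve that is quadratic inside `delta` and linear beyond it, then averaged over the 8 corners. A numeric sketch of that formula:

```python
import torch

delta = 1.0
corner_dist = torch.tensor([[0.2, 0.5, 1.5, 3.0, 0.1, 0.0, 2.0, 0.4]])  # (N=1, 8)

abs_error = corner_dist.abs()
quadratic = abs_error.clamp(max=delta)
linear = abs_error - quadratic
corner_loss = 0.5 * quadratic**2 + delta * linear  # element-wise Huber penalty
print(corner_loss.mean(dim=1))                     # per-box loss, here ~0.654
```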
+ """ + losses = dict() + rcnn_batch_size = cls_score.shape[0] + + # calculate class loss + cls_flat = cls_score.view(-1) + loss_cls = self.loss_cls(cls_flat, labels, label_weights) + losses['loss_cls'] = loss_cls + + # calculate regression loss + code_size = self.bbox_coder.code_size + pos_inds = (reg_mask > 0) + if pos_inds.any() == 0: + # fake a part loss + losses['loss_bbox'] = loss_cls.new_tensor(0) + if self.with_corner_loss: + losses['loss_corner'] = loss_cls.new_tensor(0) + else: + pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds] + bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat( + 1, pos_bbox_pred.shape[-1]) + loss_bbox = self.loss_bbox( + pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0), + bbox_weights_flat.unsqueeze(dim=0)) + losses['loss_bbox'] = loss_bbox + + if self.with_corner_loss: + pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds] + pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size) + batch_anchors = pos_roi_boxes3d.clone().detach() + pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1) + roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3) + batch_anchors[..., 0:3] = 0 + # decode boxes + pred_boxes3d = self.bbox_coder.decode( + batch_anchors, + pos_bbox_pred.view(-1, code_size)).view(-1, code_size) + + pred_boxes3d[..., 0:3] = rotation_3d_in_axis( + pred_boxes3d[..., 0:3].unsqueeze(1), + pos_rois_rotation, + axis=2).squeeze(1) + + pred_boxes3d[:, 0:3] += roi_xyz + + # calculate corner loss + loss_corner = self.get_corner_loss_lidar( + pred_boxes3d, pos_gt_bboxes) + losses['loss_corner'] = loss_corner + + return losses + + def get_targets(self, sampling_results, rcnn_train_cfg, concat=True): + """Generate targets. + + Args: + sampling_results (list[:obj:`SamplingResult`]): + Sampled results from rois. + rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn. + concat (bool): Whether to concatenate targets between batches. + + Returns: + tuple[torch.Tensor]: Targets of boxes and class prediction. + """ + pos_bboxes_list = [res.pos_bboxes for res in sampling_results] + pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] + iou_list = [res.iou for res in sampling_results] + targets = multi_apply( + self._get_target_single, + pos_bboxes_list, + pos_gt_bboxes_list, + iou_list, + cfg=rcnn_train_cfg) + + (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) = targets + + if concat: + label = torch.cat(label, 0) + bbox_targets = torch.cat(bbox_targets, 0) + pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0) + reg_mask = torch.cat(reg_mask, 0) + + label_weights = torch.cat(label_weights, 0) + label_weights /= torch.clamp(label_weights.sum(), min=1.0) + + bbox_weights = torch.cat(bbox_weights, 0) + bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0) + + return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + + def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): + """Generate training targets for a single sample. + + Args: + pos_bboxes (torch.Tensor): Positive boxes with shape + (N, 7). + pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape + (M, 7). + ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes` + in shape (N, M). + cfg (dict): Training configs. + + Returns: + tuple[torch.Tensor]: Target for positive boxes. 
+ (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + """ + cls_pos_mask = ious > cfg.cls_pos_thr + cls_neg_mask = ious < cfg.cls_neg_thr + interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0) + + # iou regression target + label = (cls_pos_mask > 0).float() + label[interval_mask] = ious[interval_mask] * 2 - 0.5 + # label weights + label_weights = (label >= 0).float() + + # box regression target + reg_mask = pos_bboxes.new_zeros(ious.size(0)).long() + reg_mask[0:pos_gt_bboxes.size(0)] = 1 + bbox_weights = (reg_mask > 0).float() + if reg_mask.bool().any(): + pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach() + roi_center = pos_bboxes[..., 0:3] + roi_ry = pos_bboxes[..., 6] % (2 * np.pi) + + # canonical transformation + pos_gt_bboxes_ct[..., 0:3] -= roi_center + pos_gt_bboxes_ct[..., 6] -= roi_ry + pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis( + pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -roi_ry, + axis=2).squeeze(1) + + # flip orientation if rois have opposite orientation + ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi + opposite_flag = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5) + ry_label[opposite_flag] = (ry_label[opposite_flag] + np.pi) % ( + 2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi) + flag = ry_label > np.pi + ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2) + ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2) + pos_gt_bboxes_ct[..., 6] = ry_label + + rois_anchor = pos_bboxes.clone().detach() + rois_anchor[:, 0:3] = 0 + rois_anchor[:, 6] = 0 + bbox_targets = self.bbox_coder.encode(rois_anchor, + pos_gt_bboxes_ct) + else: + # no fg bbox + bbox_targets = pos_gt_bboxes.new_empty((0, 7)) + + return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + + def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0): + """Calculate corner loss of given boxes. + + Args: + pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7). + gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7). + delta (float, optional): huber loss threshold. Defaults to 1.0 + + Returns: + torch.FloatTensor: Calculated corner loss in shape (N). + """ + assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0] + + # This is a little bit hack here because we assume the box for + # Part-A2 is in LiDAR coordinates + gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d) + pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners + gt_box_corners = gt_boxes_structure.corners + + # This flip only changes the heading direction of GT boxes + gt_bbox3d_flip = gt_boxes_structure.clone() + gt_bbox3d_flip.tensor[:, 6] += np.pi + gt_box_corners_flip = gt_bbox3d_flip.corners + + corner_dist = torch.min( + torch.norm(pred_box_corners - gt_box_corners, dim=2), + torch.norm(pred_box_corners - gt_box_corners_flip, + dim=2)) # (N, 8) + # huber loss + abs_error = corner_dist.abs() + quadratic = abs_error.clamp(max=delta) + linear = (abs_error - quadratic) + corner_loss = 0.5 * quadratic**2 + delta * linear + + return corner_loss.mean(dim=1) + + def get_bboxes(self, + rois, + cls_score, + bbox_pred, + class_labels, + class_pred, + img_metas, + cfg=None): + """Generate bboxes from bbox head predictions. + + Args: + rois (torch.Tensor): Roi bounding boxes. + cls_score (torch.Tensor): Scores of bounding boxes. + bbox_pred (torch.Tensor): Bounding boxes predictions + class_labels (torch.Tensor): Label of classes + class_pred (torch.Tensor): Score for nms. + img_metas (list[dict]): Point cloud and image's meta info. 
+ cfg (:obj:`ConfigDict`): Testing config. + + Returns: + list[tuple]: Decoded bbox, scores and labels after nms. + """ + roi_batch_id = rois[..., 0] + roi_boxes = rois[..., 1:] # boxes without batch id + batch_size = int(roi_batch_id.max().item() + 1) + + # decode boxes + roi_ry = roi_boxes[..., 6].view(-1) + roi_xyz = roi_boxes[..., 0:3].view(-1, 3) + local_roi_boxes = roi_boxes.clone().detach() + local_roi_boxes[..., 0:3] = 0 + rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred) + rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis( + rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1) + rcnn_boxes3d[:, 0:3] += roi_xyz + + # post processing + result_list = [] + for batch_id in range(batch_size): + cur_class_labels = class_labels[batch_id] + cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1) + + cur_box_prob = class_pred[batch_id] + cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id] + keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, + cfg.score_thr, cfg.nms_thr, + img_metas[batch_id], + cfg.use_rotate_nms) + selected_bboxes = cur_rcnn_boxes3d[keep] + selected_label_preds = cur_class_labels[keep] + selected_scores = cur_cls_score[keep] + + result_list.append( + (img_metas[batch_id]['box_type_3d'](selected_bboxes, + self.bbox_coder.code_size), + selected_scores, selected_label_preds)) + return result_list + + def multi_class_nms(self, + box_probs, + box_preds, + score_thr, + nms_thr, + input_meta, + use_rotate_nms=True): + """Multi-class NMS for box head. + + Note: + This function has large overlap with the `box3d_multiclass_nms` + implemented in `mmdet3d.core.post_processing`. We are considering + merging these two functions in the future. + + Args: + box_probs (torch.Tensor): Predicted boxes probabitilies in + shape (N,). + box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C). + score_thr (float): Threshold of scores. + nms_thr (float): Threshold for NMS. + input_meta (dict): Meta information of the current sample. + use_rotate_nms (bool, optional): Whether to use rotated nms. + Defaults to True. + + Returns: + torch.Tensor: Selected indices. 
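# Illustrative sketch (not code from the patched file): multi_class_nms, whose
# docstring ends just above and whose body follows, broadcasts scalar score/NMS
# thresholds to per-class lists and runs NMS class by class. The axis-aligned
# IoU NMS below is only a stand-in for nms_bev / nms_normal_bev, and the boxes
# and scores are random.
import torch

def simple_nms(boxes, scores, iou_thr):
    """Greedy NMS over (x1, y1, x2, y2) boxes; returns kept indices."""
    order = scores.argsort(descending=True)
    keep = []
    while order.numel() > 0:
        i = order[0]
        keep.append(i.item())
        if order.numel() == 1:
            break
        rest = boxes[order[1:]]
        lt = torch.max(boxes[i, :2], rest[:, :2])
        rb = torch.min(boxes[i, 2:], rest[:, 2:])
        inter = (rb - lt).clamp(min=0).prod(dim=1)
        area_i = (boxes[i, 2:] - boxes[i, :2]).prod()
        area_rest = (rest[:, 2:] - rest[:, :2]).prod(dim=1)
        iou = inter / (area_i + area_rest - inter)
        order = order[1:][iou <= iou_thr]
    return torch.tensor(keep, dtype=torch.long)

num_classes, score_thr, nms_thr = 3, 0.3, 0.5
score_thresh = score_thr if isinstance(score_thr, list) else [score_thr] * num_classes
nms_thresh = nms_thr if isinstance(nms_thr, list) else [nms_thr] * num_classes

box_probs = torch.rand(10, num_classes)
xy1 = torch.rand(10, 2)
boxes = torch.cat([xy1, xy1 + torch.rand(10, 2) + 0.01], dim=1)   # valid x1<x2, y1<y2

selected_list, selected_labels = [], []
for k in range(num_classes):
    class_keep = box_probs[:, k] >= score_thresh[k]
    if class_keep.any():
        original_idxs = class_keep.nonzero(as_tuple=False).view(-1)
        kept = simple_nms(boxes[original_idxs], box_probs[original_idxs, k],
                          nms_thresh[k])
        selected_list.append(original_idxs[kept])
        selected_labels.append(torch.full((kept.numel(),), k + 1, dtype=torch.long))
keep = torch.cat(selected_list) if selected_list else torch.tensor([], dtype=torch.long)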
+ """ + if use_rotate_nms: + nms_func = nms_bev + else: + nms_func = nms_normal_bev + + assert box_probs.shape[ + 1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}' + selected_list = [] + selected_labels = [] + boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + box_preds, self.bbox_coder.code_size).bev) + + score_thresh = score_thr if isinstance( + score_thr, list) else [score_thr for x in range(self.num_classes)] + nms_thresh = nms_thr if isinstance( + nms_thr, list) else [nms_thr for x in range(self.num_classes)] + for k in range(0, self.num_classes): + class_scores_keep = box_probs[:, k] >= score_thresh[k] + + if class_scores_keep.int().sum() > 0: + original_idxs = class_scores_keep.nonzero( + as_tuple=False).view(-1) + cur_boxes_for_nms = boxes_for_nms[class_scores_keep] + cur_rank_scores = box_probs[class_scores_keep, k] + + cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores, + nms_thresh[k]) + + if cur_selected.shape[0] == 0: + continue + selected_list.append(original_idxs[cur_selected]) + selected_labels.append( + torch.full([cur_selected.shape[0]], + k + 1, + dtype=torch.int64, + device=box_preds.device)) + + keep = torch.cat( + selected_list, dim=0) if len(selected_list) > 0 else [] + return keep diff --git a/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py b/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py index df46921..21dd93f 100644 --- a/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py +++ b/mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py @@ -1,575 +1,575 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from mmcv.cnn import ConvModule, normal_init -from mmcv.cnn.bricks import build_conv_layer -from mmcv.runner import BaseModule -from torch import nn as nn - -from mmdet3d.core.bbox.structures import (LiDARInstance3DBoxes, - rotation_3d_in_axis, xywhr2xyxyr) -from mmdet3d.core.post_processing import nms_bev, nms_normal_bev -from mmdet3d.models.builder import HEADS, build_loss -from mmdet3d.ops import build_sa_module -from mmdet.core import build_bbox_coder, multi_apply - - -@HEADS.register_module() -class PointRCNNBboxHead(BaseModule): - """PointRCNN RoI Bbox head. - - Args: - num_classes (int): The number of classes to prediction. - in_channels (int): Input channels of point features. - mlp_channels (list[int]): the number of mlp channels - pred_layer_cfg (dict, optional): Config of classfication and - regression prediction layers. Defaults to None. - num_points (tuple, optional): The number of points which each SA - module samples. Defaults to (128, 32, -1). - radius (tuple, optional): Sampling radius of each SA module. - Defaults to (0.2, 0.4, 100). - num_samples (tuple, optional): The number of samples for ball query - in each SA module. Defaults to (64, 64, 64). - sa_channels (tuple, optional): Out channels of each mlp in SA module. - Defaults to ((128, 128, 128), (128, 128, 256), (256, 256, 512)). - bbox_coder (dict, optional): Config dict of box coders. - Defaults to dict(type='DeltaXYZWLHRBBoxCoder'). - sa_cfg (dict, optional): Config of set abstraction module, which may - contain the following keys and values: - - - pool_mod (str): Pool method ('max' or 'avg') for SA modules. - - use_xyz (bool): Whether to use xyz as a part of features. - - normalize_xyz (bool): Whether to normalize xyz with radii in - each SA module. - Defaults to dict(type='PointSAModule', pool_mod='max', - use_xyz=True). - conv_cfg (dict, optional): Config dict of convolutional layers. 
- Defaults to dict(type='Conv1d'). - norm_cfg (dict, optional): Config dict of normalization layers. - Defaults to dict(type='BN1d'). - act_cfg (dict, optional): Config dict of activation layers. - Defaults to dict(type='ReLU'). - bias (str, optional): Type of bias. Defaults to 'auto'. - loss_bbox (dict, optional): Config of regression loss function. - Defaults to dict(type='SmoothL1Loss', beta=1.0 / 9.0, - reduction='sum', loss_weight=1.0). - loss_cls (dict, optional): Config of classification loss function. - Defaults to dict(type='CrossEntropyLoss', use_sigmoid=True, - reduction='sum', loss_weight=1.0). - with_corner_loss (bool, optional): Whether using corner loss. - Defaults to True. - init_cfg (dict, optional): Config of initialization. Defaults to None. - """ - - def __init__( - self, - num_classes, - in_channels, - mlp_channels, - pred_layer_cfg=None, - num_points=(128, 32, -1), - radius=(0.2, 0.4, 100), - num_samples=(64, 64, 64), - sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)), - bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), - sa_cfg=dict(type='PointSAModule', pool_mod='max', use_xyz=True), - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - bias='auto', - loss_bbox=dict( - type='SmoothL1Loss', - beta=1.0 / 9.0, - reduction='sum', - loss_weight=1.0), - loss_cls=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - reduction='sum', - loss_weight=1.0), - with_corner_loss=True, - init_cfg=None): - super(PointRCNNBboxHead, self).__init__(init_cfg=init_cfg) - self.num_classes = num_classes - self.num_sa = len(sa_channels) - self.with_corner_loss = with_corner_loss - self.conv_cfg = conv_cfg - self.norm_cfg = norm_cfg - self.act_cfg = act_cfg - self.bias = bias - - self.loss_bbox = build_loss(loss_bbox) - self.loss_cls = build_loss(loss_cls) - self.bbox_coder = build_bbox_coder(bbox_coder) - self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) - - self.in_channels = in_channels - mlp_channels = [self.in_channels] + mlp_channels - shared_mlps = nn.Sequential() - for i in range(len(mlp_channels) - 1): - shared_mlps.add_module( - f'layer{i}', - ConvModule( - mlp_channels[i], - mlp_channels[i + 1], - kernel_size=(1, 1), - stride=(1, 1), - inplace=False, - conv_cfg=dict(type='Conv2d'))) - self.xyz_up_layer = nn.Sequential(*shared_mlps) - - c_out = mlp_channels[-1] - self.merge_down_layer = ConvModule( - c_out * 2, - c_out, - kernel_size=(1, 1), - stride=(1, 1), - inplace=False, - conv_cfg=dict(type='Conv2d')) - - pre_channels = c_out - - self.SA_modules = nn.ModuleList() - sa_in_channel = pre_channels - - for sa_index in range(self.num_sa): - cur_sa_mlps = list(sa_channels[sa_index]) - cur_sa_mlps = [sa_in_channel] + cur_sa_mlps - sa_out_channel = cur_sa_mlps[-1] - - cur_num_points = num_points[sa_index] - if cur_num_points <= 0: - cur_num_points = None - self.SA_modules.append( - build_sa_module( - num_point=cur_num_points, - radius=radius[sa_index], - num_sample=num_samples[sa_index], - mlp_channels=cur_sa_mlps, - cfg=sa_cfg)) - sa_in_channel = sa_out_channel - self.cls_convs = self._add_conv_branch( - pred_layer_cfg.in_channels, pred_layer_cfg.cls_conv_channels) - self.reg_convs = self._add_conv_branch( - pred_layer_cfg.in_channels, pred_layer_cfg.reg_conv_channels) - - prev_channel = pred_layer_cfg.cls_conv_channels[-1] - self.conv_cls = build_conv_layer( - self.conv_cfg, - in_channels=prev_channel, - out_channels=self.num_classes, - kernel_size=1) - prev_channel = pred_layer_cfg.reg_conv_channels[-1] - self.conv_reg = 
build_conv_layer( - self.conv_cfg, - in_channels=prev_channel, - out_channels=self.bbox_coder.code_size * self.num_classes, - kernel_size=1) - - if init_cfg is None: - self.init_cfg = dict(type='Xavier', layer=['Conv2d', 'Conv1d']) - - def _add_conv_branch(self, in_channels, conv_channels): - """Add shared or separable branch. - - Args: - in_channels (int): Input feature channel. - conv_channels (tuple): Middle feature channels. - """ - conv_spec = [in_channels] + list(conv_channels) - # add branch specific conv layers - conv_layers = nn.Sequential() - for i in range(len(conv_spec) - 1): - conv_layers.add_module( - f'layer{i}', - ConvModule( - conv_spec[i], - conv_spec[i + 1], - kernel_size=1, - padding=0, - conv_cfg=self.conv_cfg, - norm_cfg=self.norm_cfg, - act_cfg=self.act_cfg, - bias=self.bias, - inplace=True)) - return conv_layers - - def init_weights(self): - """Initialize weights of the head.""" - super().init_weights() - for m in self.modules(): - if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d): - if m.bias is not None: - nn.init.constant_(m.bias, 0) - normal_init(self.conv_reg.weight, mean=0, std=0.001) - - def forward(self, feats): - """Forward pass. - - Args: - feats (torch.Torch): Features from RCNN modules. - - Returns: - tuple[torch.Tensor]: Score of class and bbox predictions. - """ - input_data = feats.clone().detach() - xyz_input = input_data[..., 0:self.in_channels].transpose( - 1, 2).unsqueeze(dim=3).contiguous().clone().detach() - xyz_features = self.xyz_up_layer(xyz_input) - rpn_features = input_data[..., self.in_channels:].transpose( - 1, 2).unsqueeze(dim=3) - merged_features = torch.cat((xyz_features, rpn_features), dim=1) - merged_features = self.merge_down_layer(merged_features) - l_xyz, l_features = [input_data[..., 0:3].contiguous()], \ - [merged_features.squeeze(dim=3)] - for i in range(len(self.SA_modules)): - li_xyz, li_features, cur_indices = \ - self.SA_modules[i](l_xyz[i], l_features[i]) - l_xyz.append(li_xyz) - l_features.append(li_features) - - shared_features = l_features[-1] - x_cls = shared_features - x_reg = shared_features - x_cls = self.cls_convs(x_cls) - rcnn_cls = self.conv_cls(x_cls) - x_reg = self.reg_convs(x_reg) - rcnn_reg = self.conv_reg(x_reg) - rcnn_cls = rcnn_cls.transpose(1, 2).contiguous().squeeze(dim=1) - rcnn_reg = rcnn_reg.transpose(1, 2).contiguous().squeeze(dim=1) - return rcnn_cls, rcnn_reg - - def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, - pos_gt_bboxes, reg_mask, label_weights, bbox_weights): - """Computing losses. - - Args: - cls_score (torch.Tensor): Scores of each RoI. - bbox_pred (torch.Tensor): Predictions of bboxes. - rois (torch.Tensor): RoI bboxes. - labels (torch.Tensor): Labels of class. - bbox_targets (torch.Tensor): Target of positive bboxes. - pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes. - reg_mask (torch.Tensor): Mask for positive bboxes. - label_weights (torch.Tensor): Weights of class loss. - bbox_weights (torch.Tensor): Weights of bbox loss. - - Returns: - dict: Computed losses. - - - loss_cls (torch.Tensor): Loss of classes. - - loss_bbox (torch.Tensor): Loss of bboxes. - - loss_corner (torch.Tensor): Loss of corners. 
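# Illustrative sketch (not code from the patched file): both bbox heads compute
# the loss_corner term listed above via get_corner_loss_lidar, taking per box
# the smaller corner distance to the GT box or to its heading-flipped copy and
# then applying a Huber penalty. Corner extraction is left to
# LiDARInstance3DBoxes; the tensors below are random stand-ins with the expected
# (N, 8, 3) shape, and the flipped corners are only a placeholder for yaw + pi.
import torch

delta = 1.0
pred_corners = torch.rand(4, 8, 3)
gt_corners = torch.rand(4, 8, 3)
gt_corners_flip = torch.rand(4, 8, 3)        # would come from flipping the GT heading

corner_dist = torch.min(
    torch.norm(pred_corners - gt_corners, dim=2),
    torch.norm(pred_corners - gt_corners_flip, dim=2))             # (N, 8)
abs_error = corner_dist.abs()
quadratic = abs_error.clamp(max=delta)
linear = abs_error - quadratic
corner_loss = (0.5 * quadratic ** 2 + delta * linear).mean(dim=1)  # (N,)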
- """ - losses = dict() - rcnn_batch_size = cls_score.shape[0] - # calculate class loss - cls_flat = cls_score.view(-1) - loss_cls = self.loss_cls(cls_flat, labels, label_weights) - losses['loss_cls'] = loss_cls - - # calculate regression loss - code_size = self.bbox_coder.code_size - pos_inds = (reg_mask > 0) - - pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds].clone() - bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat( - 1, pos_bbox_pred.shape[-1]) - loss_bbox = self.loss_bbox( - pos_bbox_pred.unsqueeze(dim=0), - bbox_targets.unsqueeze(dim=0).detach(), - bbox_weights_flat.unsqueeze(dim=0)) - losses['loss_bbox'] = loss_bbox - - if pos_inds.any() != 0 and self.with_corner_loss: - rois = rois.detach() - pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds] - pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size) - batch_anchors = pos_roi_boxes3d.clone().detach() - pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1) - roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3) - batch_anchors[..., 0:3] = 0 - # decode boxes - pred_boxes3d = self.bbox_coder.decode( - batch_anchors, - pos_bbox_pred.view(-1, code_size)).view(-1, code_size) - - pred_boxes3d[..., 0:3] = rotation_3d_in_axis( - pred_boxes3d[..., 0:3].unsqueeze(1), (pos_rois_rotation), - axis=2).squeeze(1) - - pred_boxes3d[:, 0:3] += roi_xyz - - # calculate corner loss - loss_corner = self.get_corner_loss_lidar(pred_boxes3d, - pos_gt_bboxes) - - losses['loss_corner'] = loss_corner - else: - losses['loss_corner'] = loss_cls.new_tensor(0) - - return losses - - def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0): - """Calculate corner loss of given boxes. - - Args: - pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7). - gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7). - delta (float, optional): huber loss threshold. Defaults to 1.0 - - Returns: - torch.FloatTensor: Calculated corner loss in shape (N). - """ - assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0] - - # This is a little bit hack here because we assume the box for - # PointRCNN is in LiDAR coordinates - - gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d) - pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners - gt_box_corners = gt_boxes_structure.corners - - # This flip only changes the heading direction of GT boxes - gt_bbox3d_flip = gt_boxes_structure.clone() - gt_bbox3d_flip.tensor[:, 6] += np.pi - gt_box_corners_flip = gt_bbox3d_flip.corners - - corner_dist = torch.min( - torch.norm(pred_box_corners - gt_box_corners, dim=2), - torch.norm(pred_box_corners - gt_box_corners_flip, dim=2)) - # huber loss - abs_error = corner_dist.abs() - quadratic = abs_error.clamp(max=delta) - linear = (abs_error - quadratic) - corner_loss = 0.5 * quadratic**2 + delta * linear - return corner_loss.mean(dim=1) - - def get_targets(self, sampling_results, rcnn_train_cfg, concat=True): - """Generate targets. - - Args: - sampling_results (list[:obj:`SamplingResult`]): - Sampled results from rois. - rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn. - concat (bool, optional): Whether to concatenate targets between - batches. Defaults to True. - - Returns: - tuple[torch.Tensor]: Targets of boxes and class prediction. 
- """ - pos_bboxes_list = [res.pos_bboxes for res in sampling_results] - pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] - iou_list = [res.iou for res in sampling_results] - targets = multi_apply( - self._get_target_single, - pos_bboxes_list, - pos_gt_bboxes_list, - iou_list, - cfg=rcnn_train_cfg) - (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, - bbox_weights) = targets - - if concat: - label = torch.cat(label, 0) - bbox_targets = torch.cat(bbox_targets, 0) - pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0) - reg_mask = torch.cat(reg_mask, 0) - - label_weights = torch.cat(label_weights, 0) - label_weights /= torch.clamp(label_weights.sum(), min=1.0) - - bbox_weights = torch.cat(bbox_weights, 0) - bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0) - - return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, - bbox_weights) - - def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): - """Generate training targets for a single sample. - - Args: - pos_bboxes (torch.Tensor): Positive boxes with shape - (N, 7). - pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape - (M, 7). - ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes` - in shape (N, M). - cfg (dict): Training configs. - - Returns: - tuple[torch.Tensor]: Target for positive boxes. - (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, - bbox_weights) - """ - cls_pos_mask = ious > cfg.cls_pos_thr - cls_neg_mask = ious < cfg.cls_neg_thr - interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0) - # iou regression target - label = (cls_pos_mask > 0).float() - label[interval_mask] = (ious[interval_mask] - cfg.cls_neg_thr) / \ - (cfg.cls_pos_thr - cfg.cls_neg_thr) - # label weights - label_weights = (label >= 0).float() - # box regression target - reg_mask = pos_bboxes.new_zeros(ious.size(0)).long() - reg_mask[0:pos_gt_bboxes.size(0)] = 1 - bbox_weights = (reg_mask > 0).float() - if reg_mask.bool().any(): - pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach() - roi_center = pos_bboxes[..., 0:3] - roi_ry = pos_bboxes[..., 6] % (2 * np.pi) - - # canonical transformation - pos_gt_bboxes_ct[..., 0:3] -= roi_center - pos_gt_bboxes_ct[..., 6] -= roi_ry - pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis( - pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -(roi_ry), - axis=2).squeeze(1) - - # flip orientation if gt have opposite orientation - ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi - is_opposite = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5) - ry_label[is_opposite] = (ry_label[is_opposite] + np.pi) % ( - 2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi) - flag = ry_label > np.pi - ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2) - ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2) - pos_gt_bboxes_ct[..., 6] = ry_label - - rois_anchor = pos_bboxes.clone().detach() - rois_anchor[:, 0:3] = 0 - rois_anchor[:, 6] = 0 - bbox_targets = self.bbox_coder.encode(rois_anchor, - pos_gt_bboxes_ct) - else: - # no fg bbox - bbox_targets = pos_gt_bboxes.new_empty((0, 7)) - - return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, - bbox_weights) - - def get_bboxes(self, - rois, - cls_score, - bbox_pred, - class_labels, - img_metas, - cfg=None): - """Generate bboxes from bbox head predictions. - - Args: - rois (torch.Tensor): RoI bounding boxes. - cls_score (torch.Tensor): Scores of bounding boxes. 
- bbox_pred (torch.Tensor): Bounding boxes predictions - class_labels (torch.Tensor): Label of classes - img_metas (list[dict]): Point cloud and image's meta info. - cfg (:obj:`ConfigDict`, optional): Testing config. - Defaults to None. - - Returns: - list[tuple]: Decoded bbox, scores and labels after nms. - """ - roi_batch_id = rois[..., 0] - roi_boxes = rois[..., 1:] # boxes without batch id - batch_size = int(roi_batch_id.max().item() + 1) - - # decode boxes - roi_ry = roi_boxes[..., 6].view(-1) - roi_xyz = roi_boxes[..., 0:3].view(-1, 3) - local_roi_boxes = roi_boxes.clone().detach() - local_roi_boxes[..., 0:3] = 0 - rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred) - rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis( - rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1) - rcnn_boxes3d[:, 0:3] += roi_xyz - - # post processing - result_list = [] - for batch_id in range(batch_size): - cur_class_labels = class_labels[batch_id] - cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1) - - cur_box_prob = cur_cls_score.unsqueeze(1) - cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id] - keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, - cfg.score_thr, cfg.nms_thr, - img_metas[batch_id], - cfg.use_rotate_nms) - selected_bboxes = cur_rcnn_boxes3d[keep] - selected_label_preds = cur_class_labels[keep] - selected_scores = cur_cls_score[keep] - - result_list.append( - (img_metas[batch_id]['box_type_3d'](selected_bboxes, - self.bbox_coder.code_size), - selected_scores, selected_label_preds)) - return result_list - - def multi_class_nms(self, - box_probs, - box_preds, - score_thr, - nms_thr, - input_meta, - use_rotate_nms=True): - """Multi-class NMS for box head. - - Note: - This function has large overlap with the `box3d_multiclass_nms` - implemented in `mmdet3d.core.post_processing`. We are considering - merging these two functions in the future. - - Args: - box_probs (torch.Tensor): Predicted boxes probabilities in - shape (N,). - box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C). - score_thr (float): Threshold of scores. - nms_thr (float): Threshold for NMS. - input_meta (dict): Meta information of the current sample. - use_rotate_nms (bool, optional): Whether to use rotated nms. - Defaults to True. - - Returns: - torch.Tensor: Selected indices. 
- """ - if use_rotate_nms: - nms_func = nms_bev - else: - nms_func = nms_normal_bev - - assert box_probs.shape[ - 1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}' - selected_list = [] - selected_labels = [] - boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( - box_preds, self.bbox_coder.code_size).bev) - - score_thresh = score_thr if isinstance( - score_thr, list) else [score_thr for x in range(self.num_classes)] - nms_thresh = nms_thr if isinstance( - nms_thr, list) else [nms_thr for x in range(self.num_classes)] - for k in range(0, self.num_classes): - class_scores_keep = box_probs[:, k] >= score_thresh[k] - - if class_scores_keep.int().sum() > 0: - original_idxs = class_scores_keep.nonzero( - as_tuple=False).view(-1) - cur_boxes_for_nms = boxes_for_nms[class_scores_keep] - cur_rank_scores = box_probs[class_scores_keep, k] - - cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores, - nms_thresh[k]) - - if cur_selected.shape[0] == 0: - continue - selected_list.append(original_idxs[cur_selected]) - selected_labels.append( - torch.full([cur_selected.shape[0]], - k + 1, - dtype=torch.int64, - device=box_preds.device)) - - keep = torch.cat( - selected_list, dim=0) if len(selected_list) > 0 else [] - return keep +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from mmcv.cnn import ConvModule, normal_init +from mmcv.cnn.bricks import build_conv_layer +from mmcv.runner import BaseModule +from torch import nn as nn + +from mmdet3d.core.bbox.structures import (LiDARInstance3DBoxes, + rotation_3d_in_axis, xywhr2xyxyr) +from mmdet3d.core.post_processing import nms_bev, nms_normal_bev +from mmdet3d.models.builder import HEADS, build_loss +from mmdet3d.ops import build_sa_module +from mmdet.core import build_bbox_coder, multi_apply + + +@HEADS.register_module() +class PointRCNNBboxHead(BaseModule): + """PointRCNN RoI Bbox head. + + Args: + num_classes (int): The number of classes to prediction. + in_channels (int): Input channels of point features. + mlp_channels (list[int]): the number of mlp channels + pred_layer_cfg (dict, optional): Config of classfication and + regression prediction layers. Defaults to None. + num_points (tuple, optional): The number of points which each SA + module samples. Defaults to (128, 32, -1). + radius (tuple, optional): Sampling radius of each SA module. + Defaults to (0.2, 0.4, 100). + num_samples (tuple, optional): The number of samples for ball query + in each SA module. Defaults to (64, 64, 64). + sa_channels (tuple, optional): Out channels of each mlp in SA module. + Defaults to ((128, 128, 128), (128, 128, 256), (256, 256, 512)). + bbox_coder (dict, optional): Config dict of box coders. + Defaults to dict(type='DeltaXYZWLHRBBoxCoder'). + sa_cfg (dict, optional): Config of set abstraction module, which may + contain the following keys and values: + + - pool_mod (str): Pool method ('max' or 'avg') for SA modules. + - use_xyz (bool): Whether to use xyz as a part of features. + - normalize_xyz (bool): Whether to normalize xyz with radii in + each SA module. + Defaults to dict(type='PointSAModule', pool_mod='max', + use_xyz=True). + conv_cfg (dict, optional): Config dict of convolutional layers. + Defaults to dict(type='Conv1d'). + norm_cfg (dict, optional): Config dict of normalization layers. + Defaults to dict(type='BN1d'). + act_cfg (dict, optional): Config dict of activation layers. + Defaults to dict(type='ReLU'). + bias (str, optional): Type of bias. Defaults to 'auto'. 
+ loss_bbox (dict, optional): Config of regression loss function. + Defaults to dict(type='SmoothL1Loss', beta=1.0 / 9.0, + reduction='sum', loss_weight=1.0). + loss_cls (dict, optional): Config of classification loss function. + Defaults to dict(type='CrossEntropyLoss', use_sigmoid=True, + reduction='sum', loss_weight=1.0). + with_corner_loss (bool, optional): Whether using corner loss. + Defaults to True. + init_cfg (dict, optional): Config of initialization. Defaults to None. + """ + + def __init__( + self, + num_classes, + in_channels, + mlp_channels, + pred_layer_cfg=None, + num_points=(128, 32, -1), + radius=(0.2, 0.4, 100), + num_samples=(64, 64, 64), + sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + sa_cfg=dict(type='PointSAModule', pool_mod='max', use_xyz=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + bias='auto', + loss_bbox=dict( + type='SmoothL1Loss', + beta=1.0 / 9.0, + reduction='sum', + loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + with_corner_loss=True, + init_cfg=None): + super(PointRCNNBboxHead, self).__init__(init_cfg=init_cfg) + self.num_classes = num_classes + self.num_sa = len(sa_channels) + self.with_corner_loss = with_corner_loss + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.bias = bias + + self.loss_bbox = build_loss(loss_bbox) + self.loss_cls = build_loss(loss_cls) + self.bbox_coder = build_bbox_coder(bbox_coder) + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + + self.in_channels = in_channels + mlp_channels = [self.in_channels] + mlp_channels + shared_mlps = nn.Sequential() + for i in range(len(mlp_channels) - 1): + shared_mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, 1), + stride=(1, 1), + inplace=False, + conv_cfg=dict(type='Conv2d'))) + self.xyz_up_layer = nn.Sequential(*shared_mlps) + + c_out = mlp_channels[-1] + self.merge_down_layer = ConvModule( + c_out * 2, + c_out, + kernel_size=(1, 1), + stride=(1, 1), + inplace=False, + conv_cfg=dict(type='Conv2d')) + + pre_channels = c_out + + self.SA_modules = nn.ModuleList() + sa_in_channel = pre_channels + + for sa_index in range(self.num_sa): + cur_sa_mlps = list(sa_channels[sa_index]) + cur_sa_mlps = [sa_in_channel] + cur_sa_mlps + sa_out_channel = cur_sa_mlps[-1] + + cur_num_points = num_points[sa_index] + if cur_num_points <= 0: + cur_num_points = None + self.SA_modules.append( + build_sa_module( + num_point=cur_num_points, + radius=radius[sa_index], + num_sample=num_samples[sa_index], + mlp_channels=cur_sa_mlps, + cfg=sa_cfg)) + sa_in_channel = sa_out_channel + self.cls_convs = self._add_conv_branch( + pred_layer_cfg.in_channels, pred_layer_cfg.cls_conv_channels) + self.reg_convs = self._add_conv_branch( + pred_layer_cfg.in_channels, pred_layer_cfg.reg_conv_channels) + + prev_channel = pred_layer_cfg.cls_conv_channels[-1] + self.conv_cls = build_conv_layer( + self.conv_cfg, + in_channels=prev_channel, + out_channels=self.num_classes, + kernel_size=1) + prev_channel = pred_layer_cfg.reg_conv_channels[-1] + self.conv_reg = build_conv_layer( + self.conv_cfg, + in_channels=prev_channel, + out_channels=self.bbox_coder.code_size * self.num_classes, + kernel_size=1) + + if init_cfg is None: + self.init_cfg = dict(type='Xavier', layer=['Conv2d', 'Conv1d']) + + def _add_conv_branch(self, in_channels, 
conv_channels): + """Add shared or separable branch. + + Args: + in_channels (int): Input feature channel. + conv_channels (tuple): Middle feature channels. + """ + conv_spec = [in_channels] + list(conv_channels) + # add branch specific conv layers + conv_layers = nn.Sequential() + for i in range(len(conv_spec) - 1): + conv_layers.add_module( + f'layer{i}', + ConvModule( + conv_spec[i], + conv_spec[i + 1], + kernel_size=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=self.bias, + inplace=True)) + return conv_layers + + def init_weights(self): + """Initialize weights of the head.""" + super().init_weights() + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + normal_init(self.conv_reg.weight, mean=0, std=0.001) + + def forward(self, feats): + """Forward pass. + + Args: + feats (torch.Torch): Features from RCNN modules. + + Returns: + tuple[torch.Tensor]: Score of class and bbox predictions. + """ + input_data = feats.clone().detach() + xyz_input = input_data[..., 0:self.in_channels].transpose( + 1, 2).unsqueeze(dim=3).contiguous().clone().detach() + xyz_features = self.xyz_up_layer(xyz_input) + rpn_features = input_data[..., self.in_channels:].transpose( + 1, 2).unsqueeze(dim=3) + merged_features = torch.cat((xyz_features, rpn_features), dim=1) + merged_features = self.merge_down_layer(merged_features) + l_xyz, l_features = [input_data[..., 0:3].contiguous()], \ + [merged_features.squeeze(dim=3)] + for i in range(len(self.SA_modules)): + li_xyz, li_features, cur_indices = \ + self.SA_modules[i](l_xyz[i], l_features[i]) + l_xyz.append(li_xyz) + l_features.append(li_features) + + shared_features = l_features[-1] + x_cls = shared_features + x_reg = shared_features + x_cls = self.cls_convs(x_cls) + rcnn_cls = self.conv_cls(x_cls) + x_reg = self.reg_convs(x_reg) + rcnn_reg = self.conv_reg(x_reg) + rcnn_cls = rcnn_cls.transpose(1, 2).contiguous().squeeze(dim=1) + rcnn_reg = rcnn_reg.transpose(1, 2).contiguous().squeeze(dim=1) + return rcnn_cls, rcnn_reg + + def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, + pos_gt_bboxes, reg_mask, label_weights, bbox_weights): + """Computing losses. + + Args: + cls_score (torch.Tensor): Scores of each RoI. + bbox_pred (torch.Tensor): Predictions of bboxes. + rois (torch.Tensor): RoI bboxes. + labels (torch.Tensor): Labels of class. + bbox_targets (torch.Tensor): Target of positive bboxes. + pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes. + reg_mask (torch.Tensor): Mask for positive bboxes. + label_weights (torch.Tensor): Weights of class loss. + bbox_weights (torch.Tensor): Weights of bbox loss. + + Returns: + dict: Computed losses. + + - loss_cls (torch.Tensor): Loss of classes. + - loss_bbox (torch.Tensor): Loss of bboxes. + - loss_corner (torch.Tensor): Loss of corners. 
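# Illustrative sketch (not code from the patched file): when these heads decode
# boxes (in loss and get_bboxes above), centers predicted in the RoI-local frame
# are rotated about the z-axis by the RoI yaw and then shifted by the RoI
# center, mirroring rotation_3d_in_axis(..., axis=2) followed by the += roi_xyz
# step. The sign convention of the real rotation_3d_in_axis is assumed here.
import torch

def rotate_around_z(points, angles):
    """Rotate (N, 3) points about z by per-point angles (N,)."""
    cos, sin = torch.cos(angles), torch.sin(angles)
    x = points[:, 0] * cos - points[:, 1] * sin
    y = points[:, 0] * sin + points[:, 1] * cos
    return torch.stack([x, y, points[:, 2]], dim=1)

local_centers = torch.rand(5, 3)            # decoded with zeroed anchor centers
roi_yaw = torch.rand(5) * 6.28
roi_xyz = torch.rand(5, 3)
lidar_centers = rotate_around_z(local_centers, roi_yaw) + roi_xyz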
+ """ + losses = dict() + rcnn_batch_size = cls_score.shape[0] + # calculate class loss + cls_flat = cls_score.view(-1) + loss_cls = self.loss_cls(cls_flat, labels, label_weights) + losses['loss_cls'] = loss_cls + + # calculate regression loss + code_size = self.bbox_coder.code_size + pos_inds = (reg_mask > 0) + + pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds].clone() + bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat( + 1, pos_bbox_pred.shape[-1]) + loss_bbox = self.loss_bbox( + pos_bbox_pred.unsqueeze(dim=0), + bbox_targets.unsqueeze(dim=0).detach(), + bbox_weights_flat.unsqueeze(dim=0)) + losses['loss_bbox'] = loss_bbox + + if pos_inds.any() != 0 and self.with_corner_loss: + rois = rois.detach() + pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds] + pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size) + batch_anchors = pos_roi_boxes3d.clone().detach() + pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1) + roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3) + batch_anchors[..., 0:3] = 0 + # decode boxes + pred_boxes3d = self.bbox_coder.decode( + batch_anchors, + pos_bbox_pred.view(-1, code_size)).view(-1, code_size) + + pred_boxes3d[..., 0:3] = rotation_3d_in_axis( + pred_boxes3d[..., 0:3].unsqueeze(1), (pos_rois_rotation), + axis=2).squeeze(1) + + pred_boxes3d[:, 0:3] += roi_xyz + + # calculate corner loss + loss_corner = self.get_corner_loss_lidar(pred_boxes3d, + pos_gt_bboxes) + + losses['loss_corner'] = loss_corner + else: + losses['loss_corner'] = loss_cls.new_tensor(0) + + return losses + + def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0): + """Calculate corner loss of given boxes. + + Args: + pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7). + gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7). + delta (float, optional): huber loss threshold. Defaults to 1.0 + + Returns: + torch.FloatTensor: Calculated corner loss in shape (N). + """ + assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0] + + # This is a little bit hack here because we assume the box for + # PointRCNN is in LiDAR coordinates + + gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d) + pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners + gt_box_corners = gt_boxes_structure.corners + + # This flip only changes the heading direction of GT boxes + gt_bbox3d_flip = gt_boxes_structure.clone() + gt_bbox3d_flip.tensor[:, 6] += np.pi + gt_box_corners_flip = gt_bbox3d_flip.corners + + corner_dist = torch.min( + torch.norm(pred_box_corners - gt_box_corners, dim=2), + torch.norm(pred_box_corners - gt_box_corners_flip, dim=2)) + # huber loss + abs_error = corner_dist.abs() + quadratic = abs_error.clamp(max=delta) + linear = (abs_error - quadratic) + corner_loss = 0.5 * quadratic**2 + delta * linear + return corner_loss.mean(dim=1) + + def get_targets(self, sampling_results, rcnn_train_cfg, concat=True): + """Generate targets. + + Args: + sampling_results (list[:obj:`SamplingResult`]): + Sampled results from rois. + rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn. + concat (bool, optional): Whether to concatenate targets between + batches. Defaults to True. + + Returns: + tuple[torch.Tensor]: Targets of boxes and class prediction. 
+ """ + pos_bboxes_list = [res.pos_bboxes for res in sampling_results] + pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] + iou_list = [res.iou for res in sampling_results] + targets = multi_apply( + self._get_target_single, + pos_bboxes_list, + pos_gt_bboxes_list, + iou_list, + cfg=rcnn_train_cfg) + (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) = targets + + if concat: + label = torch.cat(label, 0) + bbox_targets = torch.cat(bbox_targets, 0) + pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0) + reg_mask = torch.cat(reg_mask, 0) + + label_weights = torch.cat(label_weights, 0) + label_weights /= torch.clamp(label_weights.sum(), min=1.0) + + bbox_weights = torch.cat(bbox_weights, 0) + bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0) + + return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + + def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): + """Generate training targets for a single sample. + + Args: + pos_bboxes (torch.Tensor): Positive boxes with shape + (N, 7). + pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape + (M, 7). + ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes` + in shape (N, M). + cfg (dict): Training configs. + + Returns: + tuple[torch.Tensor]: Target for positive boxes. + (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + """ + cls_pos_mask = ious > cfg.cls_pos_thr + cls_neg_mask = ious < cfg.cls_neg_thr + interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0) + # iou regression target + label = (cls_pos_mask > 0).float() + label[interval_mask] = (ious[interval_mask] - cfg.cls_neg_thr) / \ + (cfg.cls_pos_thr - cfg.cls_neg_thr) + # label weights + label_weights = (label >= 0).float() + # box regression target + reg_mask = pos_bboxes.new_zeros(ious.size(0)).long() + reg_mask[0:pos_gt_bboxes.size(0)] = 1 + bbox_weights = (reg_mask > 0).float() + if reg_mask.bool().any(): + pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach() + roi_center = pos_bboxes[..., 0:3] + roi_ry = pos_bboxes[..., 6] % (2 * np.pi) + + # canonical transformation + pos_gt_bboxes_ct[..., 0:3] -= roi_center + pos_gt_bboxes_ct[..., 6] -= roi_ry + pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis( + pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -(roi_ry), + axis=2).squeeze(1) + + # flip orientation if gt have opposite orientation + ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi + is_opposite = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5) + ry_label[is_opposite] = (ry_label[is_opposite] + np.pi) % ( + 2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi) + flag = ry_label > np.pi + ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2) + ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2) + pos_gt_bboxes_ct[..., 6] = ry_label + + rois_anchor = pos_bboxes.clone().detach() + rois_anchor[:, 0:3] = 0 + rois_anchor[:, 6] = 0 + bbox_targets = self.bbox_coder.encode(rois_anchor, + pos_gt_bboxes_ct) + else: + # no fg bbox + bbox_targets = pos_gt_bboxes.new_empty((0, 7)) + + return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, + bbox_weights) + + def get_bboxes(self, + rois, + cls_score, + bbox_pred, + class_labels, + img_metas, + cfg=None): + """Generate bboxes from bbox head predictions. + + Args: + rois (torch.Tensor): RoI bounding boxes. + cls_score (torch.Tensor): Scores of bounding boxes. 
+ bbox_pred (torch.Tensor): Bounding boxes predictions + class_labels (torch.Tensor): Label of classes + img_metas (list[dict]): Point cloud and image's meta info. + cfg (:obj:`ConfigDict`, optional): Testing config. + Defaults to None. + + Returns: + list[tuple]: Decoded bbox, scores and labels after nms. + """ + roi_batch_id = rois[..., 0] + roi_boxes = rois[..., 1:] # boxes without batch id + batch_size = int(roi_batch_id.max().item() + 1) + + # decode boxes + roi_ry = roi_boxes[..., 6].view(-1) + roi_xyz = roi_boxes[..., 0:3].view(-1, 3) + local_roi_boxes = roi_boxes.clone().detach() + local_roi_boxes[..., 0:3] = 0 + rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred) + rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis( + rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1) + rcnn_boxes3d[:, 0:3] += roi_xyz + + # post processing + result_list = [] + for batch_id in range(batch_size): + cur_class_labels = class_labels[batch_id] + cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1) + + cur_box_prob = cur_cls_score.unsqueeze(1) + cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id] + keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, + cfg.score_thr, cfg.nms_thr, + img_metas[batch_id], + cfg.use_rotate_nms) + selected_bboxes = cur_rcnn_boxes3d[keep] + selected_label_preds = cur_class_labels[keep] + selected_scores = cur_cls_score[keep] + + result_list.append( + (img_metas[batch_id]['box_type_3d'](selected_bboxes, + self.bbox_coder.code_size), + selected_scores, selected_label_preds)) + return result_list + + def multi_class_nms(self, + box_probs, + box_preds, + score_thr, + nms_thr, + input_meta, + use_rotate_nms=True): + """Multi-class NMS for box head. + + Note: + This function has large overlap with the `box3d_multiclass_nms` + implemented in `mmdet3d.core.post_processing`. We are considering + merging these two functions in the future. + + Args: + box_probs (torch.Tensor): Predicted boxes probabilities in + shape (N,). + box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C). + score_thr (float): Threshold of scores. + nms_thr (float): Threshold for NMS. + input_meta (dict): Meta information of the current sample. + use_rotate_nms (bool, optional): Whether to use rotated nms. + Defaults to True. + + Returns: + torch.Tensor: Selected indices. 
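# Illustrative sketch (not code from the patched file): in get_bboxes above, the
# first RoI column is the batch index, and predictions are grouped per sample
# with boolean masks before per-sample NMS. The RoIs and scores below are random
# stand-ins for real network outputs.
import torch

rois = torch.cat([torch.tensor([[0.], [0.], [1.]]), torch.rand(3, 7)], dim=1)
cls_score = torch.rand(3, 1)

roi_batch_id = rois[..., 0]
batch_size = int(roi_batch_id.max().item() + 1)
for batch_id in range(batch_size):
    cur_mask = roi_batch_id == batch_id
    cur_scores = cls_score[cur_mask].view(-1)
    cur_boxes = rois[cur_mask][:, 1:]
    # per-sample NMS and box_type_3d construction would follow here
    print(batch_id, cur_boxes.shape, cur_scores.shape)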
+ """ + if use_rotate_nms: + nms_func = nms_bev + else: + nms_func = nms_normal_bev + + assert box_probs.shape[ + 1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}' + selected_list = [] + selected_labels = [] + boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + box_preds, self.bbox_coder.code_size).bev) + + score_thresh = score_thr if isinstance( + score_thr, list) else [score_thr for x in range(self.num_classes)] + nms_thresh = nms_thr if isinstance( + nms_thr, list) else [nms_thr for x in range(self.num_classes)] + for k in range(0, self.num_classes): + class_scores_keep = box_probs[:, k] >= score_thresh[k] + + if class_scores_keep.int().sum() > 0: + original_idxs = class_scores_keep.nonzero( + as_tuple=False).view(-1) + cur_boxes_for_nms = boxes_for_nms[class_scores_keep] + cur_rank_scores = box_probs[class_scores_keep, k] + + cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores, + nms_thresh[k]) + + if cur_selected.shape[0] == 0: + continue + selected_list.append(original_idxs[cur_selected]) + selected_labels.append( + torch.full([cur_selected.shape[0]], + k + 1, + dtype=torch.int64, + device=box_preds.device)) + + keep = torch.cat( + selected_list, dim=0) if len(selected_list) > 0 else [] + return keep diff --git a/mmdet3d/models/roi_heads/h3d_roi_head.py b/mmdet3d/models/roi_heads/h3d_roi_head.py index b6b9597..2c3b12a 100644 --- a/mmdet3d/models/roi_heads/h3d_roi_head.py +++ b/mmdet3d/models/roi_heads/h3d_roi_head.py @@ -1,159 +1,159 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet3d.core.bbox import bbox3d2result -from ..builder import HEADS, build_head -from .base_3droi_head import Base3DRoIHead - - -@HEADS.register_module() -class H3DRoIHead(Base3DRoIHead): - """H3D roi head for H3DNet. - - Args: - primitive_list (List): Configs of primitive heads. - bbox_head (ConfigDict): Config of bbox_head. - train_cfg (ConfigDict): Training config. - test_cfg (ConfigDict): Testing config. - """ - - def __init__(self, - primitive_list, - bbox_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(H3DRoIHead, self).__init__( - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - pretrained=pretrained, - init_cfg=init_cfg) - # Primitive module - assert len(primitive_list) == 3 - self.primitive_z = build_head(primitive_list[0]) - self.primitive_xy = build_head(primitive_list[1]) - self.primitive_line = build_head(primitive_list[2]) - - def init_mask_head(self): - """Initialize mask head, skip since ``H3DROIHead`` does not have - one.""" - pass - - def init_bbox_head(self, bbox_head): - """Initialize box head.""" - bbox_head['train_cfg'] = self.train_cfg - bbox_head['test_cfg'] = self.test_cfg - self.bbox_head = build_head(bbox_head) - - def init_assigner_sampler(self): - """Initialize assigner and sampler.""" - pass - - def forward_train(self, - feats_dict, - img_metas, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask, - pts_instance_mask, - gt_bboxes_ignore=None): - """Training forward function of PartAggregationROIHead. - - Args: - feats_dict (dict): Contains features from the first stage. - img_metas (list[dict]): Contain pcd and img's meta info. - points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each sample. - gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (list[torch.Tensor]): Point-wise - semantic mask. - pts_instance_mask (list[torch.Tensor]): Point-wise - instance mask. 
- gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding boxes to ignore. - - Returns: - dict: losses from each head. - """ - losses = dict() - - sample_mod = self.train_cfg.sample_mod - assert sample_mod in ['vote', 'seed', 'random'] - result_z = self.primitive_z(feats_dict, sample_mod) - feats_dict.update(result_z) - - result_xy = self.primitive_xy(feats_dict, sample_mod) - feats_dict.update(result_xy) - - result_line = self.primitive_line(feats_dict, sample_mod) - feats_dict.update(result_line) - - primitive_loss_inputs = (feats_dict, points, gt_bboxes_3d, - gt_labels_3d, pts_semantic_mask, - pts_instance_mask, img_metas, - gt_bboxes_ignore) - - loss_z = self.primitive_z.loss(*primitive_loss_inputs) - losses.update(loss_z) - - loss_xy = self.primitive_xy.loss(*primitive_loss_inputs) - losses.update(loss_xy) - - loss_line = self.primitive_line.loss(*primitive_loss_inputs) - losses.update(loss_line) - - targets = feats_dict.pop('targets') - - bbox_results = self.bbox_head(feats_dict, sample_mod) - - feats_dict.update(bbox_results) - bbox_loss = self.bbox_head.loss(feats_dict, points, gt_bboxes_3d, - gt_labels_3d, pts_semantic_mask, - pts_instance_mask, img_metas, targets, - gt_bboxes_ignore) - losses.update(bbox_loss) - - return losses - - def simple_test(self, feats_dict, img_metas, points, rescale=False): - """Simple testing forward function of PartAggregationROIHead. - - Note: - This function assumes that the batch size is 1 - - Args: - feats_dict (dict): Contains features from the first stage. - img_metas (list[dict]): Contain pcd and img's meta info. - points (torch.Tensor): Input points. - rescale (bool): Whether to rescale results. - - Returns: - dict: Bbox results of one frame. - """ - sample_mod = self.test_cfg.sample_mod - assert sample_mod in ['vote', 'seed', 'random'] - - result_z = self.primitive_z(feats_dict, sample_mod) - feats_dict.update(result_z) - - result_xy = self.primitive_xy(feats_dict, sample_mod) - feats_dict.update(result_xy) - - result_line = self.primitive_line(feats_dict, sample_mod) - feats_dict.update(result_line) - - bbox_preds = self.bbox_head(feats_dict, sample_mod) - feats_dict.update(bbox_preds) - bbox_list = self.bbox_head.get_bboxes( - points, - feats_dict, - img_metas, - rescale=rescale, - suffix='_optimized') - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet3d.core.bbox import bbox3d2result +from ..builder import HEADS, build_head +from .base_3droi_head import Base3DRoIHead + + +@HEADS.register_module() +class H3DRoIHead(Base3DRoIHead): + """H3D roi head for H3DNet. + + Args: + primitive_list (List): Configs of primitive heads. + bbox_head (ConfigDict): Config of bbox_head. + train_cfg (ConfigDict): Training config. + test_cfg (ConfigDict): Testing config. 
+ """ + + def __init__(self, + primitive_list, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(H3DRoIHead, self).__init__( + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + init_cfg=init_cfg) + # Primitive module + assert len(primitive_list) == 3 + self.primitive_z = build_head(primitive_list[0]) + self.primitive_xy = build_head(primitive_list[1]) + self.primitive_line = build_head(primitive_list[2]) + + def init_mask_head(self): + """Initialize mask head, skip since ``H3DROIHead`` does not have + one.""" + pass + + def init_bbox_head(self, bbox_head): + """Initialize box head.""" + bbox_head['train_cfg'] = self.train_cfg + bbox_head['test_cfg'] = self.test_cfg + self.bbox_head = build_head(bbox_head) + + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + pass + + def forward_train(self, + feats_dict, + img_metas, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask, + pts_instance_mask, + gt_bboxes_ignore=None): + """Training forward function of PartAggregationROIHead. + + Args: + feats_dict (dict): Contains features from the first stage. + img_metas (list[dict]): Contain pcd and img's meta info. + points (list[torch.Tensor]): Input points. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + pts_semantic_mask (list[torch.Tensor]): Point-wise + semantic mask. + pts_instance_mask (list[torch.Tensor]): Point-wise + instance mask. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding boxes to ignore. + + Returns: + dict: losses from each head. + """ + losses = dict() + + sample_mod = self.train_cfg.sample_mod + assert sample_mod in ['vote', 'seed', 'random'] + result_z = self.primitive_z(feats_dict, sample_mod) + feats_dict.update(result_z) + + result_xy = self.primitive_xy(feats_dict, sample_mod) + feats_dict.update(result_xy) + + result_line = self.primitive_line(feats_dict, sample_mod) + feats_dict.update(result_line) + + primitive_loss_inputs = (feats_dict, points, gt_bboxes_3d, + gt_labels_3d, pts_semantic_mask, + pts_instance_mask, img_metas, + gt_bboxes_ignore) + + loss_z = self.primitive_z.loss(*primitive_loss_inputs) + losses.update(loss_z) + + loss_xy = self.primitive_xy.loss(*primitive_loss_inputs) + losses.update(loss_xy) + + loss_line = self.primitive_line.loss(*primitive_loss_inputs) + losses.update(loss_line) + + targets = feats_dict.pop('targets') + + bbox_results = self.bbox_head(feats_dict, sample_mod) + + feats_dict.update(bbox_results) + bbox_loss = self.bbox_head.loss(feats_dict, points, gt_bboxes_3d, + gt_labels_3d, pts_semantic_mask, + pts_instance_mask, img_metas, targets, + gt_bboxes_ignore) + losses.update(bbox_loss) + + return losses + + def simple_test(self, feats_dict, img_metas, points, rescale=False): + """Simple testing forward function of PartAggregationROIHead. + + Note: + This function assumes that the batch size is 1 + + Args: + feats_dict (dict): Contains features from the first stage. + img_metas (list[dict]): Contain pcd and img's meta info. + points (torch.Tensor): Input points. + rescale (bool): Whether to rescale results. + + Returns: + dict: Bbox results of one frame. 
+ """ + sample_mod = self.test_cfg.sample_mod + assert sample_mod in ['vote', 'seed', 'random'] + + result_z = self.primitive_z(feats_dict, sample_mod) + feats_dict.update(result_z) + + result_xy = self.primitive_xy(feats_dict, sample_mod) + feats_dict.update(result_xy) + + result_line = self.primitive_line(feats_dict, sample_mod) + feats_dict.update(result_line) + + bbox_preds = self.bbox_head(feats_dict, sample_mod) + feats_dict.update(bbox_preds) + bbox_list = self.bbox_head.get_bboxes( + points, + feats_dict, + img_metas, + rescale=rescale, + suffix='_optimized') + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results diff --git a/mmdet3d/models/roi_heads/mask_heads/__init__.py b/mmdet3d/models/roi_heads/mask_heads/__init__.py index 0aa1156..a4b9f8f 100644 --- a/mmdet3d/models/roi_heads/mask_heads/__init__.py +++ b/mmdet3d/models/roi_heads/mask_heads/__init__.py @@ -1,5 +1,5 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .pointwise_semantic_head import PointwiseSemanticHead -from .primitive_head import PrimitiveHead - -__all__ = ['PointwiseSemanticHead', 'PrimitiveHead'] +# Copyright (c) OpenMMLab. All rights reserved. +from .pointwise_semantic_head import PointwiseSemanticHead +from .primitive_head import PrimitiveHead + +__all__ = ['PointwiseSemanticHead', 'PrimitiveHead'] diff --git a/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py b/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py index fc0bcf5..270914f 100644 --- a/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py +++ b/mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py @@ -1,202 +1,202 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.runner import BaseModule -from torch import nn as nn -from torch.nn import functional as F - -from mmdet3d.core.bbox.structures import rotation_3d_in_axis -from mmdet3d.models.builder import HEADS, build_loss -from mmdet.core import multi_apply - - -@HEADS.register_module() -class PointwiseSemanticHead(BaseModule): - """Semantic segmentation head for point-wise segmentation. - - Predict point-wise segmentation and part regression results for PartA2. - See `paper `_ for more details. - - Args: - in_channels (int): The number of input channel. - num_classes (int): The number of class. - extra_width (float): Boxes enlarge width. - loss_seg (dict): Config of segmentation loss. - loss_part (dict): Config of part prediction loss. - """ - - def __init__(self, - in_channels, - num_classes=3, - extra_width=0.2, - seg_score_thr=0.3, - init_cfg=None, - loss_seg=dict( - type='FocalLoss', - use_sigmoid=True, - reduction='sum', - gamma=2.0, - alpha=0.25, - loss_weight=1.0), - loss_part=dict( - type='CrossEntropyLoss', - use_sigmoid=True, - loss_weight=1.0)): - super(PointwiseSemanticHead, self).__init__(init_cfg=init_cfg) - self.extra_width = extra_width - self.num_classes = num_classes - self.seg_score_thr = seg_score_thr - self.seg_cls_layer = nn.Linear(in_channels, 1, bias=True) - self.seg_reg_layer = nn.Linear(in_channels, 3, bias=True) - - self.loss_seg = build_loss(loss_seg) - self.loss_part = build_loss(loss_part) - - def forward(self, x): - """Forward pass. - - Args: - x (torch.Tensor): Features from the first stage. - - Returns: - dict: Part features, segmentation and part predictions. - - - seg_preds (torch.Tensor): Segment predictions. - - part_preds (torch.Tensor): Part predictions. - - part_feats (torch.Tensor): Feature predictions. 
- """ - seg_preds = self.seg_cls_layer(x) # (N, 1) - part_preds = self.seg_reg_layer(x) # (N, 3) - - seg_scores = torch.sigmoid(seg_preds).detach() - seg_mask = (seg_scores > self.seg_score_thr) - - part_offsets = torch.sigmoid(part_preds).clone().detach() - part_offsets[seg_mask.view(-1) == 0] = 0 - part_feats = torch.cat((part_offsets, seg_scores), - dim=-1) # shape (npoints, 4) - return dict( - seg_preds=seg_preds, part_preds=part_preds, part_feats=part_feats) - - def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d): - """generate segmentation and part prediction targets for a single - sample. - - Args: - voxel_centers (torch.Tensor): The center of voxels in shape - (voxel_num, 3). - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in - shape (box_num, 7). - gt_labels_3d (torch.Tensor): Class labels of ground truths in - shape (box_num). - - Returns: - tuple[torch.Tensor]: Segmentation targets with shape [voxel_num] - part prediction targets with shape [voxel_num, 3] - """ - gt_bboxes_3d = gt_bboxes_3d.to(voxel_centers.device) - enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width) - - part_targets = voxel_centers.new_zeros((voxel_centers.shape[0], 3), - dtype=torch.float32) - box_idx = gt_bboxes_3d.points_in_boxes_part(voxel_centers) - enlarge_box_idx = enlarged_gt_boxes.points_in_boxes_part( - voxel_centers).long() - - gt_labels_pad = F.pad( - gt_labels_3d, (1, 0), mode='constant', value=self.num_classes) - seg_targets = gt_labels_pad[(box_idx.long() + 1)] - fg_pt_flag = box_idx > -1 - ignore_flag = fg_pt_flag ^ (enlarge_box_idx > -1) - seg_targets[ignore_flag] = -1 - - for k in range(len(gt_bboxes_3d)): - k_box_flag = box_idx == k - # no point in current box (caused by velodyne reduce) - if not k_box_flag.any(): - continue - fg_voxels = voxel_centers[k_box_flag] - transformed_voxels = fg_voxels - gt_bboxes_3d.bottom_center[k] - transformed_voxels = rotation_3d_in_axis( - transformed_voxels.unsqueeze(0), - -gt_bboxes_3d.yaw[k].view(1), - axis=2) - part_targets[k_box_flag] = transformed_voxels / gt_bboxes_3d.dims[ - k] + voxel_centers.new_tensor([0.5, 0.5, 0]) - - part_targets = torch.clamp(part_targets, min=0) - return seg_targets, part_targets - - def get_targets(self, voxels_dict, gt_bboxes_3d, gt_labels_3d): - """generate segmentation and part prediction targets. - - Args: - voxel_centers (torch.Tensor): The center of voxels in shape - (voxel_num, 3). - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in - shape (box_num, 7). - gt_labels_3d (torch.Tensor): Class labels of ground truths in - shape (box_num). - - Returns: - dict: Prediction targets - - - seg_targets (torch.Tensor): Segmentation targets - with shape [voxel_num]. - - part_targets (torch.Tensor): Part prediction targets - with shape [voxel_num, 3]. - """ - batch_size = len(gt_labels_3d) - voxel_center_list = [] - for idx in range(batch_size): - coords_idx = voxels_dict['coors'][:, 0] == idx - voxel_center_list.append(voxels_dict['voxel_centers'][coords_idx]) - - seg_targets, part_targets = multi_apply(self.get_targets_single, - voxel_center_list, - gt_bboxes_3d, gt_labels_3d) - seg_targets = torch.cat(seg_targets, dim=0) - part_targets = torch.cat(part_targets, dim=0) - return dict(seg_targets=seg_targets, part_targets=part_targets) - - def loss(self, semantic_results, semantic_targets): - """Calculate point-wise segmentation and part prediction losses. - - Args: - semantic_results (dict): Results from semantic head. - - - seg_preds: Segmentation predictions. 
- - part_preds: Part predictions. - - semantic_targets (dict): Targets of semantic results. - - - seg_preds: Segmentation targets. - - part_preds: Part targets. - - Returns: - dict: Loss of segmentation and part prediction. - - - loss_seg (torch.Tensor): Segmentation prediction loss. - - loss_part (torch.Tensor): Part prediction loss. - """ - seg_preds = semantic_results['seg_preds'] - part_preds = semantic_results['part_preds'] - seg_targets = semantic_targets['seg_targets'] - part_targets = semantic_targets['part_targets'] - - pos_mask = (seg_targets > -1) & (seg_targets < self.num_classes) - binary_seg_target = pos_mask.long() - pos = pos_mask.float() - neg = (seg_targets == self.num_classes).float() - seg_weights = pos + neg - pos_normalizer = pos.sum() - seg_weights = seg_weights / torch.clamp(pos_normalizer, min=1.0) - loss_seg = self.loss_seg(seg_preds, binary_seg_target, seg_weights) - - if pos_normalizer > 0: - loss_part = self.loss_part(part_preds[pos_mask], - part_targets[pos_mask]) - else: - # fake a part loss - loss_part = loss_seg.new_tensor(0) - - return dict(loss_seg=loss_seg, loss_part=loss_part) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.runner import BaseModule +from torch import nn as nn +from torch.nn import functional as F + +from mmdet3d.core.bbox.structures import rotation_3d_in_axis +from mmdet3d.models.builder import HEADS, build_loss +from mmdet.core import multi_apply + + +@HEADS.register_module() +class PointwiseSemanticHead(BaseModule): + """Semantic segmentation head for point-wise segmentation. + + Predict point-wise segmentation and part regression results for PartA2. + See `paper `_ for more details. + + Args: + in_channels (int): The number of input channel. + num_classes (int): The number of class. + extra_width (float): Boxes enlarge width. + loss_seg (dict): Config of segmentation loss. + loss_part (dict): Config of part prediction loss. + """ + + def __init__(self, + in_channels, + num_classes=3, + extra_width=0.2, + seg_score_thr=0.3, + init_cfg=None, + loss_seg=dict( + type='FocalLoss', + use_sigmoid=True, + reduction='sum', + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_part=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0)): + super(PointwiseSemanticHead, self).__init__(init_cfg=init_cfg) + self.extra_width = extra_width + self.num_classes = num_classes + self.seg_score_thr = seg_score_thr + self.seg_cls_layer = nn.Linear(in_channels, 1, bias=True) + self.seg_reg_layer = nn.Linear(in_channels, 3, bias=True) + + self.loss_seg = build_loss(loss_seg) + self.loss_part = build_loss(loss_part) + + def forward(self, x): + """Forward pass. + + Args: + x (torch.Tensor): Features from the first stage. + + Returns: + dict: Part features, segmentation and part predictions. + + - seg_preds (torch.Tensor): Segment predictions. + - part_preds (torch.Tensor): Part predictions. + - part_feats (torch.Tensor): Feature predictions. 
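+
+        Example:
+            A minimal shape check. ``in_channels=16`` and the batch of 8
+            point features are placeholders; assumes a standard mmdet3d
+            install where the default losses are registered:
+
+            >>> import torch
+            >>> from mmdet3d.models.roi_heads.mask_heads import PointwiseSemanticHead
+            >>> head = PointwiseSemanticHead(in_channels=16)
+            >>> out = head(torch.rand(8, 16))
+            >>> (out['seg_preds'].shape, out['part_preds'].shape,
+            ...  out['part_feats'].shape)
+            (torch.Size([8, 1]), torch.Size([8, 3]), torch.Size([8, 4]))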
+ """ + seg_preds = self.seg_cls_layer(x) # (N, 1) + part_preds = self.seg_reg_layer(x) # (N, 3) + + seg_scores = torch.sigmoid(seg_preds).detach() + seg_mask = (seg_scores > self.seg_score_thr) + + part_offsets = torch.sigmoid(part_preds).clone().detach() + part_offsets[seg_mask.view(-1) == 0] = 0 + part_feats = torch.cat((part_offsets, seg_scores), + dim=-1) # shape (npoints, 4) + return dict( + seg_preds=seg_preds, part_preds=part_preds, part_feats=part_feats) + + def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d): + """generate segmentation and part prediction targets for a single + sample. + + Args: + voxel_centers (torch.Tensor): The center of voxels in shape + (voxel_num, 3). + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in + shape (box_num, 7). + gt_labels_3d (torch.Tensor): Class labels of ground truths in + shape (box_num). + + Returns: + tuple[torch.Tensor]: Segmentation targets with shape [voxel_num] + part prediction targets with shape [voxel_num, 3] + """ + gt_bboxes_3d = gt_bboxes_3d.to(voxel_centers.device) + enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width) + + part_targets = voxel_centers.new_zeros((voxel_centers.shape[0], 3), + dtype=torch.float32) + box_idx = gt_bboxes_3d.points_in_boxes_part(voxel_centers) + enlarge_box_idx = enlarged_gt_boxes.points_in_boxes_part( + voxel_centers).long() + + gt_labels_pad = F.pad( + gt_labels_3d, (1, 0), mode='constant', value=self.num_classes) + seg_targets = gt_labels_pad[(box_idx.long() + 1)] + fg_pt_flag = box_idx > -1 + ignore_flag = fg_pt_flag ^ (enlarge_box_idx > -1) + seg_targets[ignore_flag] = -1 + + for k in range(len(gt_bboxes_3d)): + k_box_flag = box_idx == k + # no point in current box (caused by velodyne reduce) + if not k_box_flag.any(): + continue + fg_voxels = voxel_centers[k_box_flag] + transformed_voxels = fg_voxels - gt_bboxes_3d.bottom_center[k] + transformed_voxels = rotation_3d_in_axis( + transformed_voxels.unsqueeze(0), + -gt_bboxes_3d.yaw[k].view(1), + axis=2) + part_targets[k_box_flag] = transformed_voxels / gt_bboxes_3d.dims[ + k] + voxel_centers.new_tensor([0.5, 0.5, 0]) + + part_targets = torch.clamp(part_targets, min=0) + return seg_targets, part_targets + + def get_targets(self, voxels_dict, gt_bboxes_3d, gt_labels_3d): + """generate segmentation and part prediction targets. + + Args: + voxel_centers (torch.Tensor): The center of voxels in shape + (voxel_num, 3). + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in + shape (box_num, 7). + gt_labels_3d (torch.Tensor): Class labels of ground truths in + shape (box_num). + + Returns: + dict: Prediction targets + + - seg_targets (torch.Tensor): Segmentation targets + with shape [voxel_num]. + - part_targets (torch.Tensor): Part prediction targets + with shape [voxel_num, 3]. + """ + batch_size = len(gt_labels_3d) + voxel_center_list = [] + for idx in range(batch_size): + coords_idx = voxels_dict['coors'][:, 0] == idx + voxel_center_list.append(voxels_dict['voxel_centers'][coords_idx]) + + seg_targets, part_targets = multi_apply(self.get_targets_single, + voxel_center_list, + gt_bboxes_3d, gt_labels_3d) + seg_targets = torch.cat(seg_targets, dim=0) + part_targets = torch.cat(part_targets, dim=0) + return dict(seg_targets=seg_targets, part_targets=part_targets) + + def loss(self, semantic_results, semantic_targets): + """Calculate point-wise segmentation and part prediction losses. + + Args: + semantic_results (dict): Results from semantic head. + + - seg_preds: Segmentation predictions. 
+ - part_preds: Part predictions. + + semantic_targets (dict): Targets of semantic results. + + - seg_preds: Segmentation targets. + - part_preds: Part targets. + + Returns: + dict: Loss of segmentation and part prediction. + + - loss_seg (torch.Tensor): Segmentation prediction loss. + - loss_part (torch.Tensor): Part prediction loss. + """ + seg_preds = semantic_results['seg_preds'] + part_preds = semantic_results['part_preds'] + seg_targets = semantic_targets['seg_targets'] + part_targets = semantic_targets['part_targets'] + + pos_mask = (seg_targets > -1) & (seg_targets < self.num_classes) + binary_seg_target = pos_mask.long() + pos = pos_mask.float() + neg = (seg_targets == self.num_classes).float() + seg_weights = pos + neg + pos_normalizer = pos.sum() + seg_weights = seg_weights / torch.clamp(pos_normalizer, min=1.0) + loss_seg = self.loss_seg(seg_preds, binary_seg_target, seg_weights) + + if pos_normalizer > 0: + loss_part = self.loss_part(part_preds[pos_mask], + part_targets[pos_mask]) + else: + # fake a part loss + loss_part = loss_seg.new_tensor(0) + + return dict(loss_seg=loss_seg, loss_part=loss_part) diff --git a/mmdet3d/models/roi_heads/mask_heads/primitive_head.py b/mmdet3d/models/roi_heads/mask_heads/primitive_head.py index 4c9c28b..c8eee25 100644 --- a/mmdet3d/models/roi_heads/mask_heads/primitive_head.py +++ b/mmdet3d/models/roi_heads/mask_heads/primitive_head.py @@ -1,966 +1,966 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import ConvModule -from mmcv.ops import furthest_point_sample -from mmcv.runner import BaseModule -from torch import nn as nn -from torch.nn import functional as F - -from mmdet3d.models.builder import HEADS, build_loss -from mmdet3d.models.model_utils import VoteModule -from mmdet3d.ops import build_sa_module -from mmdet.core import multi_apply - - -@HEADS.register_module() -class PrimitiveHead(BaseModule): - r"""Primitive head of `H3DNet `_. - - Args: - num_dims (int): The dimension of primitive semantic information. - num_classes (int): The number of class. - primitive_mode (str): The mode of primitive module, - available mode ['z', 'xy', 'line']. - bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and - decoding boxes. - train_cfg (dict): Config for training. - test_cfg (dict): Config for testing. - vote_module_cfg (dict): Config of VoteModule for point-wise votes. - vote_aggregation_cfg (dict): Config of vote aggregation layer. - feat_channels (tuple[int]): Convolution channels of - prediction layer. - upper_thresh (float): Threshold for line matching. - surface_thresh (float): Threshold for surface matching. - conv_cfg (dict): Config of convolution in prediction layer. - norm_cfg (dict): Config of BN in prediction layer. - objectness_loss (dict): Config of objectness loss. - center_loss (dict): Config of center loss. - semantic_loss (dict): Config of point-wise semantic segmentation loss. - """ - - def __init__(self, - num_dims, - num_classes, - primitive_mode, - train_cfg=None, - test_cfg=None, - vote_module_cfg=None, - vote_aggregation_cfg=None, - feat_channels=(128, 128), - upper_thresh=100.0, - surface_thresh=0.5, - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - objectness_loss=None, - center_loss=None, - semantic_reg_loss=None, - semantic_cls_loss=None, - init_cfg=None): - super(PrimitiveHead, self).__init__(init_cfg=init_cfg) - assert primitive_mode in ['z', 'xy', 'line'] - # The dimension of primitive semantic information. 
- self.num_dims = num_dims - self.num_classes = num_classes - self.primitive_mode = primitive_mode - self.train_cfg = train_cfg - self.test_cfg = test_cfg - self.gt_per_seed = vote_module_cfg['gt_per_seed'] - self.num_proposal = vote_aggregation_cfg['num_point'] - self.upper_thresh = upper_thresh - self.surface_thresh = surface_thresh - - self.objectness_loss = build_loss(objectness_loss) - self.center_loss = build_loss(center_loss) - self.semantic_reg_loss = build_loss(semantic_reg_loss) - self.semantic_cls_loss = build_loss(semantic_cls_loss) - - assert vote_aggregation_cfg['mlp_channels'][0] == vote_module_cfg[ - 'in_channels'] - - # Primitive existence flag prediction - self.flag_conv = ConvModule( - vote_module_cfg['conv_channels'][-1], - vote_module_cfg['conv_channels'][-1] // 2, - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True, - inplace=True) - self.flag_pred = torch.nn.Conv1d( - vote_module_cfg['conv_channels'][-1] // 2, 2, 1) - - self.vote_module = VoteModule(**vote_module_cfg) - self.vote_aggregation = build_sa_module(vote_aggregation_cfg) - - prev_channel = vote_aggregation_cfg['mlp_channels'][-1] - conv_pred_list = list() - for k in range(len(feat_channels)): - conv_pred_list.append( - ConvModule( - prev_channel, - feat_channels[k], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - bias=True, - inplace=True)) - prev_channel = feat_channels[k] - self.conv_pred = nn.Sequential(*conv_pred_list) - - conv_out_channel = 3 + num_dims + num_classes - self.conv_pred.add_module('conv_out', - nn.Conv1d(prev_channel, conv_out_channel, 1)) - - def forward(self, feats_dict, sample_mod): - """Forward pass. - - Args: - feats_dict (dict): Feature dict from backbone. - sample_mod (str): Sample mode for vote aggregation layer. - valid modes are "vote", "seed" and "random". - - Returns: - dict: Predictions of primitive head. - """ - assert sample_mod in ['vote', 'seed', 'random'] - - seed_points = feats_dict['fp_xyz_net0'][-1] - seed_features = feats_dict['hd_feature'] - results = {} - - primitive_flag = self.flag_conv(seed_features) - primitive_flag = self.flag_pred(primitive_flag) - - results['pred_flag_' + self.primitive_mode] = primitive_flag - - # 1. generate vote_points from seed_points - vote_points, vote_features, _ = self.vote_module( - seed_points, seed_features) - results['vote_' + self.primitive_mode] = vote_points - results['vote_features_' + self.primitive_mode] = vote_features - - # 2. aggregate vote_points - if sample_mod == 'vote': - # use fps in vote_aggregation - sample_indices = None - elif sample_mod == 'seed': - # FPS on seed and choose the votes corresponding to the seeds - sample_indices = furthest_point_sample(seed_points, - self.num_proposal) - elif sample_mod == 'random': - # Random sampling from the votes - batch_size, num_seed = seed_points.shape[:2] - sample_indices = torch.randint( - 0, - num_seed, (batch_size, self.num_proposal), - dtype=torch.int32, - device=seed_points.device) - else: - raise NotImplementedError('Unsupported sample mod!') - - vote_aggregation_ret = self.vote_aggregation(vote_points, - vote_features, - sample_indices) - aggregated_points, features, aggregated_indices = vote_aggregation_ret - results['aggregated_points_' + self.primitive_mode] = aggregated_points - results['aggregated_features_' + self.primitive_mode] = features - results['aggregated_indices_' + - self.primitive_mode] = aggregated_indices - - # 3. predict primitive offsets and semantic information - predictions = self.conv_pred(features) - - # 4. 
decode predictions - decode_ret = self.primitive_decode_scores(predictions, - aggregated_points) - results.update(decode_ret) - - center, pred_ind = self.get_primitive_center( - primitive_flag, decode_ret['center_' + self.primitive_mode]) - - results['pred_' + self.primitive_mode + '_ind'] = pred_ind - results['pred_' + self.primitive_mode + '_center'] = center - return results - - def loss(self, - bbox_preds, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - img_metas=None, - gt_bboxes_ignore=None): - """Compute loss. - - Args: - bbox_preds (dict): Predictions from forward of primitive head. - points (list[torch.Tensor]): Input points. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each sample. - gt_labels_3d (list[torch.Tensor]): Labels of each sample. - pts_semantic_mask (list[torch.Tensor]): Point-wise - semantic mask. - pts_instance_mask (list[torch.Tensor]): Point-wise - instance mask. - img_metas (list[dict]): Contain pcd and img's meta info. - gt_bboxes_ignore (list[torch.Tensor]): Specify - which bounding. - - Returns: - dict: Losses of Primitive Head. - """ - targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask, - bbox_preds) - - (point_mask, point_offset, gt_primitive_center, gt_primitive_semantic, - gt_sem_cls_label, gt_primitive_mask) = targets - - losses = {} - # Compute the loss of primitive existence flag - pred_flag = bbox_preds['pred_flag_' + self.primitive_mode] - flag_loss = self.objectness_loss(pred_flag, gt_primitive_mask.long()) - losses['flag_loss_' + self.primitive_mode] = flag_loss - - # calculate vote loss - vote_loss = self.vote_module.get_loss( - bbox_preds['seed_points'], - bbox_preds['vote_' + self.primitive_mode], - bbox_preds['seed_indices'], point_mask, point_offset) - losses['vote_loss_' + self.primitive_mode] = vote_loss - - num_proposal = bbox_preds['aggregated_points_' + - self.primitive_mode].shape[1] - primitive_center = bbox_preds['center_' + self.primitive_mode] - if self.primitive_mode != 'line': - primitive_semantic = bbox_preds['size_residuals_' + - self.primitive_mode].contiguous() - else: - primitive_semantic = None - semancitc_scores = bbox_preds['sem_cls_scores_' + - self.primitive_mode].transpose(2, 1) - - gt_primitive_mask = gt_primitive_mask / \ - (gt_primitive_mask.sum() + 1e-6) - center_loss, size_loss, sem_cls_loss = self.compute_primitive_loss( - primitive_center, primitive_semantic, semancitc_scores, - num_proposal, gt_primitive_center, gt_primitive_semantic, - gt_sem_cls_label, gt_primitive_mask) - losses['center_loss_' + self.primitive_mode] = center_loss - losses['size_loss_' + self.primitive_mode] = size_loss - losses['sem_loss_' + self.primitive_mode] = sem_cls_loss - - return losses - - def get_targets(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None, - bbox_preds=None): - """Generate targets of primitive head. - - Args: - points (list[torch.Tensor]): Points of each batch. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - bboxes of each batch. - gt_labels_3d (list[torch.Tensor]): Labels of each batch. - pts_semantic_mask (list[torch.Tensor]): Point-wise semantic - label of each batch. - pts_instance_mask (list[torch.Tensor]): Point-wise instance - label of each batch. - bbox_preds (dict): Predictions from forward of primitive head. - - Returns: - tuple[torch.Tensor]: Targets of primitive head. 
- """ - for index in range(len(gt_labels_3d)): - if len(gt_labels_3d[index]) == 0: - fake_box = gt_bboxes_3d[index].tensor.new_zeros( - 1, gt_bboxes_3d[index].tensor.shape[-1]) - gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) - gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) - - if pts_semantic_mask is None: - pts_semantic_mask = [None for i in range(len(gt_labels_3d))] - pts_instance_mask = [None for i in range(len(gt_labels_3d))] - - (point_mask, point_sem, - point_offset) = multi_apply(self.get_targets_single, points, - gt_bboxes_3d, gt_labels_3d, - pts_semantic_mask, pts_instance_mask) - - point_mask = torch.stack(point_mask) - point_sem = torch.stack(point_sem) - point_offset = torch.stack(point_offset) - - batch_size = point_mask.shape[0] - num_proposal = bbox_preds['aggregated_points_' + - self.primitive_mode].shape[1] - num_seed = bbox_preds['seed_points'].shape[1] - seed_inds = bbox_preds['seed_indices'].long() - seed_inds_expand = seed_inds.view(batch_size, num_seed, - 1).repeat(1, 1, 3) - seed_gt_votes = torch.gather(point_offset, 1, seed_inds_expand) - seed_gt_votes += bbox_preds['seed_points'] - gt_primitive_center = seed_gt_votes.view(batch_size * num_proposal, 1, - 3) - - seed_inds_expand_sem = seed_inds.view(batch_size, num_seed, 1).repeat( - 1, 1, 4 + self.num_dims) - seed_gt_sem = torch.gather(point_sem, 1, seed_inds_expand_sem) - gt_primitive_semantic = seed_gt_sem[:, :, 3:3 + self.num_dims].view( - batch_size * num_proposal, 1, self.num_dims).contiguous() - - gt_sem_cls_label = seed_gt_sem[:, :, -1].long() - - gt_votes_mask = torch.gather(point_mask, 1, seed_inds) - - return (point_mask, point_offset, gt_primitive_center, - gt_primitive_semantic, gt_sem_cls_label, gt_votes_mask) - - def get_targets_single(self, - points, - gt_bboxes_3d, - gt_labels_3d, - pts_semantic_mask=None, - pts_instance_mask=None): - """Generate targets of primitive head for single batch. - - Args: - points (torch.Tensor): Points of each batch. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth - boxes of each batch. - gt_labels_3d (torch.Tensor): Labels of each batch. - pts_semantic_mask (torch.Tensor): Point-wise semantic - label of each batch. - pts_instance_mask (torch.Tensor): Point-wise instance - label of each batch. - - Returns: - tuple[torch.Tensor]: Targets of primitive head. 
- """ - gt_bboxes_3d = gt_bboxes_3d.to(points.device) - num_points = points.shape[0] - - point_mask = points.new_zeros(num_points) - # Offset to the primitive center - point_offset = points.new_zeros([num_points, 3]) - # Semantic information of primitive center - point_sem = points.new_zeros([num_points, 3 + self.num_dims + 1]) - - # Generate pts_semantic_mask and pts_instance_mask when they are None - if pts_semantic_mask is None or pts_instance_mask is None: - points2box_mask = gt_bboxes_3d.points_in_boxes_all(points) - assignment = points2box_mask.argmax(1) - background_mask = points2box_mask.max(1)[0] == 0 - - if pts_semantic_mask is None: - pts_semantic_mask = gt_labels_3d[assignment] - pts_semantic_mask[background_mask] = self.num_classes - - if pts_instance_mask is None: - pts_instance_mask = assignment - pts_instance_mask[background_mask] = gt_labels_3d.shape[0] - - instance_flag = torch.nonzero( - pts_semantic_mask != self.num_classes, as_tuple=False).squeeze(1) - instance_labels = pts_instance_mask[instance_flag].unique() - - with_yaw = gt_bboxes_3d.with_yaw - for i, i_instance in enumerate(instance_labels): - indices = instance_flag[pts_instance_mask[instance_flag] == - i_instance] - coords = points[indices, :3] - cur_cls_label = pts_semantic_mask[indices][0] - - # Bbox Corners - cur_corners = gt_bboxes_3d.corners[i] - - plane_lower_temp = points.new_tensor( - [0, 0, 1, -cur_corners[7, -1]]) - upper_points = cur_corners[[1, 2, 5, 6]] - refined_distance = (upper_points * plane_lower_temp[:3]).sum(dim=1) - - if self.check_horizon(upper_points) and \ - plane_lower_temp[0] + plane_lower_temp[1] < \ - self.train_cfg['lower_thresh']: - plane_lower = points.new_tensor( - [0, 0, 1, plane_lower_temp[-1]]) - plane_upper = points.new_tensor( - [0, 0, 1, -torch.mean(refined_distance)]) - else: - raise NotImplementedError('Only horizontal plane is support!') - - if self.check_dist(plane_upper, upper_points) is False: - raise NotImplementedError( - 'Mean distance to plane should be lower than thresh!') - - # Get the boundary points here - point2plane_dist, selected = self.match_point2plane( - plane_lower, coords) - - # Get bottom four lines - if self.primitive_mode == 'line': - point2line_matching = self.match_point2line( - coords[selected], cur_corners, with_yaw, mode='bottom') - - point_mask, point_offset, point_sem = \ - self._assign_primitive_line_targets(point_mask, - point_offset, - point_sem, - coords[selected], - indices[selected], - cur_cls_label, - point2line_matching, - cur_corners, - [1, 1, 0, 0], - with_yaw, - mode='bottom') - - # Set the surface labels here - if self.primitive_mode == 'z' and \ - selected.sum() > self.train_cfg['num_point'] and \ - point2plane_dist[selected].var() < \ - self.train_cfg['var_thresh']: - - point_mask, point_offset, point_sem = \ - self._assign_primitive_surface_targets(point_mask, - point_offset, - point_sem, - coords[selected], - indices[selected], - cur_cls_label, - cur_corners, - with_yaw, - mode='bottom') - - # Get the boundary points here - point2plane_dist, selected = self.match_point2plane( - plane_upper, coords) - - # Get top four lines - if self.primitive_mode == 'line': - point2line_matching = self.match_point2line( - coords[selected], cur_corners, with_yaw, mode='top') - - point_mask, point_offset, point_sem = \ - self._assign_primitive_line_targets(point_mask, - point_offset, - point_sem, - coords[selected], - indices[selected], - cur_cls_label, - point2line_matching, - cur_corners, - [1, 1, 0, 0], - with_yaw, - mode='top') - - if 
self.primitive_mode == 'z' and \ - selected.sum() > self.train_cfg['num_point'] and \ - point2plane_dist[selected].var() < \ - self.train_cfg['var_thresh']: - - point_mask, point_offset, point_sem = \ - self._assign_primitive_surface_targets(point_mask, - point_offset, - point_sem, - coords[selected], - indices[selected], - cur_cls_label, - cur_corners, - with_yaw, - mode='top') - - # Get left two lines - plane_left_temp = self._get_plane_fomulation( - cur_corners[2] - cur_corners[3], - cur_corners[3] - cur_corners[0], cur_corners[0]) - - right_points = cur_corners[[4, 5, 7, 6]] - plane_left_temp /= torch.norm(plane_left_temp[:3]) - refined_distance = (right_points * plane_left_temp[:3]).sum(dim=1) - - if plane_left_temp[2] < self.train_cfg['lower_thresh']: - plane_left = plane_left_temp - plane_right = points.new_tensor([ - plane_left_temp[0], plane_left_temp[1], plane_left_temp[2], - -refined_distance.mean() - ]) - else: - raise NotImplementedError( - 'Normal vector of the plane should be horizontal!') - - # Get the boundary points here - point2plane_dist, selected = self.match_point2plane( - plane_left, coords) - - # Get left four lines - if self.primitive_mode == 'line': - point2line_matching = self.match_point2line( - coords[selected], cur_corners, with_yaw, mode='left') - point_mask, point_offset, point_sem = \ - self._assign_primitive_line_targets( - point_mask, point_offset, point_sem, - coords[selected], indices[selected], cur_cls_label, - point2line_matching[2:], cur_corners, [2, 2], - with_yaw, mode='left') - - if self.primitive_mode == 'xy' and \ - selected.sum() > self.train_cfg['num_point'] and \ - point2plane_dist[selected].var() < \ - self.train_cfg['var_thresh']: - - point_mask, point_offset, point_sem = \ - self._assign_primitive_surface_targets( - point_mask, point_offset, point_sem, - coords[selected], indices[selected], cur_cls_label, - cur_corners, with_yaw, mode='left') - - # Get the boundary points here - point2plane_dist, selected = self.match_point2plane( - plane_right, coords) - - # Get right four lines - if self.primitive_mode == 'line': - point2line_matching = self.match_point2line( - coords[selected], cur_corners, with_yaw, mode='right') - - point_mask, point_offset, point_sem = \ - self._assign_primitive_line_targets( - point_mask, point_offset, point_sem, - coords[selected], indices[selected], cur_cls_label, - point2line_matching[2:], cur_corners, [2, 2], - with_yaw, mode='right') - - if self.primitive_mode == 'xy' and \ - selected.sum() > self.train_cfg['num_point'] and \ - point2plane_dist[selected].var() < \ - self.train_cfg['var_thresh']: - - point_mask, point_offset, point_sem = \ - self._assign_primitive_surface_targets( - point_mask, point_offset, point_sem, - coords[selected], indices[selected], cur_cls_label, - cur_corners, with_yaw, mode='right') - - plane_front_temp = self._get_plane_fomulation( - cur_corners[0] - cur_corners[4], - cur_corners[4] - cur_corners[5], cur_corners[5]) - - back_points = cur_corners[[3, 2, 7, 6]] - plane_front_temp /= torch.norm(plane_front_temp[:3]) - refined_distance = (back_points * plane_front_temp[:3]).sum(dim=1) - - if plane_front_temp[2] < self.train_cfg['lower_thresh']: - plane_front = plane_front_temp - plane_back = points.new_tensor([ - plane_front_temp[0], plane_front_temp[1], - plane_front_temp[2], -torch.mean(refined_distance) - ]) - else: - raise NotImplementedError( - 'Normal vector of the plane should be horizontal!') - - # Get the boundary points here - point2plane_dist, selected = 
self.match_point2plane( - plane_front, coords) - - if self.primitive_mode == 'xy' and \ - selected.sum() > self.train_cfg['num_point'] and \ - (point2plane_dist[selected]).var() < \ - self.train_cfg['var_thresh']: - - point_mask, point_offset, point_sem = \ - self._assign_primitive_surface_targets( - point_mask, point_offset, point_sem, - coords[selected], indices[selected], cur_cls_label, - cur_corners, with_yaw, mode='front') - - # Get the boundary points here - point2plane_dist, selected = self.match_point2plane( - plane_back, coords) - - if self.primitive_mode == 'xy' and \ - selected.sum() > self.train_cfg['num_point'] and \ - point2plane_dist[selected].var() < \ - self.train_cfg['var_thresh']: - - point_mask, point_offset, point_sem = \ - self._assign_primitive_surface_targets( - point_mask, point_offset, point_sem, - coords[selected], indices[selected], cur_cls_label, - cur_corners, with_yaw, mode='back') - - return (point_mask, point_sem, point_offset) - - def primitive_decode_scores(self, predictions, aggregated_points): - """Decode predicted parts to primitive head. - - Args: - predictions (torch.Tensor): primitive pridictions of each batch. - aggregated_points (torch.Tensor): The aggregated points - of vote stage. - - Returns: - Dict: Predictions of primitive head, including center, - semantic size and semantic scores. - """ - - ret_dict = {} - pred_transposed = predictions.transpose(2, 1) - - center = aggregated_points + pred_transposed[:, :, 0:3] - ret_dict['center_' + self.primitive_mode] = center - - if self.primitive_mode in ['z', 'xy']: - ret_dict['size_residuals_' + self.primitive_mode] = \ - pred_transposed[:, :, 3:3 + self.num_dims] - - ret_dict['sem_cls_scores_' + self.primitive_mode] = \ - pred_transposed[:, :, 3 + self.num_dims:] - - return ret_dict - - def check_horizon(self, points): - """Check whether is a horizontal plane. - - Args: - points (torch.Tensor): Points of input. - - Returns: - Bool: Flag of result. - """ - return (points[0][-1] == points[1][-1]) and \ - (points[1][-1] == points[2][-1]) and \ - (points[2][-1] == points[3][-1]) - - def check_dist(self, plane_equ, points): - """Whether the mean of points to plane distance is lower than thresh. - - Args: - plane_equ (torch.Tensor): Plane to be checked. - points (torch.Tensor): Points to be checked. - - Returns: - Tuple: Flag of result. - """ - return (points[:, 2] + - plane_equ[-1]).sum() / 4.0 < self.train_cfg['lower_thresh'] - - def point2line_dist(self, points, pts_a, pts_b): - """Calculate the distance from point to line. - - Args: - points (torch.Tensor): Points of input. - pts_a (torch.Tensor): Point on the specific line. - pts_b (torch.Tensor): Point on the specific line. - - Returns: - torch.Tensor: Distance between each point to line. - """ - line_a2b = pts_b - pts_a - line_a2pts = points - pts_a - length = (line_a2pts * line_a2b.view(1, 3)).sum(1) / \ - line_a2b.norm() - dist = (line_a2pts.norm(dim=1)**2 - length**2).sqrt() - - return dist - - def match_point2line(self, points, corners, with_yaw, mode='bottom'): - """Match points to corresponding line. - - Args: - points (torch.Tensor): Points of input. - corners (torch.Tensor): Eight corners of a bounding box. - with_yaw (Bool): Whether the boundind box is with rotation. - mode (str, optional): Specify which line should be matched, - available mode are ('bottom', 'top', 'left', 'right'). - Defaults to 'bottom'. - - Returns: - Tuple: Flag of matching correspondence. 
- """ - if with_yaw: - corners_pair = { - 'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]], - 'top': [[1, 2], [5, 6], [1, 5], [2, 6]], - 'left': [[0, 1], [3, 2], [0, 1], [3, 2]], - 'right': [[4, 5], [7, 6], [4, 5], [7, 6]] - } - selected_list = [] - for pair_index in corners_pair[mode]: - selected = self.point2line_dist( - points, corners[pair_index[0]], corners[pair_index[1]]) \ - < self.train_cfg['line_thresh'] - selected_list.append(selected) - else: - xmin, ymin, _ = corners.min(0)[0] - xmax, ymax, _ = corners.max(0)[0] - sel1 = torch.abs(points[:, 0] - - xmin) < self.train_cfg['line_thresh'] - sel2 = torch.abs(points[:, 0] - - xmax) < self.train_cfg['line_thresh'] - sel3 = torch.abs(points[:, 1] - - ymin) < self.train_cfg['line_thresh'] - sel4 = torch.abs(points[:, 1] - - ymax) < self.train_cfg['line_thresh'] - selected_list = [sel1, sel2, sel3, sel4] - return selected_list - - def match_point2plane(self, plane, points): - """Match points to plane. - - Args: - plane (torch.Tensor): Equation of the plane. - points (torch.Tensor): Points of input. - - Returns: - Tuple: Distance of each point to the plane and - flag of matching correspondence. - """ - point2plane_dist = torch.abs((points * plane[:3]).sum(dim=1) + - plane[-1]) - min_dist = point2plane_dist.min() - selected = torch.abs(point2plane_dist - - min_dist) < self.train_cfg['dist_thresh'] - return point2plane_dist, selected - - def compute_primitive_loss(self, primitive_center, primitive_semantic, - semantic_scores, num_proposal, - gt_primitive_center, gt_primitive_semantic, - gt_sem_cls_label, gt_primitive_mask): - """Compute loss of primitive module. - - Args: - primitive_center (torch.Tensor): Pridictions of primitive center. - primitive_semantic (torch.Tensor): Pridictions of primitive - semantic. - semantic_scores (torch.Tensor): Pridictions of primitive - semantic scores. - num_proposal (int): The number of primitive proposal. - gt_primitive_center (torch.Tensor): Ground truth of - primitive center. - gt_votes_sem (torch.Tensor): Ground truth of primitive semantic. - gt_sem_cls_label (torch.Tensor): Ground truth of primitive - semantic class. - gt_primitive_mask (torch.Tensor): Ground truth of primitive mask. - - Returns: - Tuple: Loss of primitive module. - """ - batch_size = primitive_center.shape[0] - vote_xyz_reshape = primitive_center.view(batch_size * num_proposal, -1, - 3) - - center_loss = self.center_loss( - vote_xyz_reshape, - gt_primitive_center, - dst_weight=gt_primitive_mask.view(batch_size * num_proposal, 1))[1] - - if self.primitive_mode != 'line': - size_xyz_reshape = primitive_semantic.view( - batch_size * num_proposal, -1, self.num_dims).contiguous() - size_loss = self.semantic_reg_loss( - size_xyz_reshape, - gt_primitive_semantic, - dst_weight=gt_primitive_mask.view(batch_size * num_proposal, - 1))[1] - else: - size_loss = center_loss.new_tensor(0.0) - - # Semantic cls loss - sem_cls_loss = self.semantic_cls_loss( - semantic_scores, gt_sem_cls_label, weight=gt_primitive_mask) - - return center_loss, size_loss, sem_cls_loss - - def get_primitive_center(self, pred_flag, center): - """Generate primitive center from predictions. - - Args: - pred_flag (torch.Tensor): Scores of primitive center. - center (torch.Tensor): Pridictions of primitive center. - - Returns: - Tuple: Primitive center and the prediction indices. 
- """ - ind_normal = F.softmax(pred_flag, dim=1) - pred_indices = (ind_normal[:, 1, :] > - self.surface_thresh).detach().float() - selected = (ind_normal[:, 1, :] <= - self.surface_thresh).detach().float() - offset = torch.ones_like(center) * self.upper_thresh - center = center + offset * selected.unsqueeze(-1) - return center, pred_indices - - def _assign_primitive_line_targets(self, - point_mask, - point_offset, - point_sem, - coords, - indices, - cls_label, - point2line_matching, - corners, - center_axises, - with_yaw, - mode='bottom'): - """Generate targets of line primitive. - - Args: - point_mask (torch.Tensor): Tensor to store the ground - truth of mask. - point_offset (torch.Tensor): Tensor to store the ground - truth of offset. - point_sem (torch.Tensor): Tensor to store the ground - truth of semantic. - coords (torch.Tensor): The selected points. - indices (torch.Tensor): Indices of the selected points. - cls_label (int): Class label of the ground truth bounding box. - point2line_matching (torch.Tensor): Flag indicate that - matching line of each point. - corners (torch.Tensor): Corners of the ground truth bounding box. - center_axises (list[int]): Indicate in which axis the line center - should be refined. - with_yaw (Bool): Whether the boundind box is with rotation. - mode (str, optional): Specify which line should be matched, - available mode are ('bottom', 'top', 'left', 'right'). - Defaults to 'bottom'. - - Returns: - Tuple: Targets of the line primitive. - """ - corners_pair = { - 'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]], - 'top': [[1, 2], [5, 6], [1, 5], [2, 6]], - 'left': [[0, 1], [3, 2]], - 'right': [[4, 5], [7, 6]] - } - corners_pair = corners_pair[mode] - assert len(corners_pair) == len(point2line_matching) == len( - center_axises) - for line_select, center_axis, pair_index in zip( - point2line_matching, center_axises, corners_pair): - if line_select.sum() > self.train_cfg['num_point_line']: - point_mask[indices[line_select]] = 1.0 - - if with_yaw: - line_center = (corners[pair_index[0]] + - corners[pair_index[1]]) / 2 - else: - line_center = coords[line_select].mean(dim=0) - line_center[center_axis] = corners[:, center_axis].mean() - - point_offset[indices[line_select]] = \ - line_center - coords[line_select] - point_sem[indices[line_select]] = \ - point_sem.new_tensor([line_center[0], line_center[1], - line_center[2], cls_label]) - return point_mask, point_offset, point_sem - - def _assign_primitive_surface_targets(self, - point_mask, - point_offset, - point_sem, - coords, - indices, - cls_label, - corners, - with_yaw, - mode='bottom'): - """Generate targets for primitive z and primitive xy. - - Args: - point_mask (torch.Tensor): Tensor to store the ground - truth of mask. - point_offset (torch.Tensor): Tensor to store the ground - truth of offset. - point_sem (torch.Tensor): Tensor to store the ground - truth of semantic. - coords (torch.Tensor): The selected points. - indices (torch.Tensor): Indices of the selected points. - cls_label (int): Class label of the ground truth bounding box. - corners (torch.Tensor): Corners of the ground truth bounding box. - with_yaw (Bool): Whether the boundind box is with rotation. - mode (str, optional): Specify which line should be matched, - available mode are ('bottom', 'top', 'left', 'right', - 'front', 'back'). - Defaults to 'bottom'. - - Returns: - Tuple: Targets of the center primitive. 
- """ - point_mask[indices] = 1.0 - corners_pair = { - 'bottom': [0, 7], - 'top': [1, 6], - 'left': [0, 1], - 'right': [4, 5], - 'front': [0, 1], - 'back': [3, 2] - } - pair_index = corners_pair[mode] - if self.primitive_mode == 'z': - if with_yaw: - center = (corners[pair_index[0]] + - corners[pair_index[1]]) / 2.0 - center[2] = coords[:, 2].mean() - point_sem[indices] = point_sem.new_tensor([ - center[0], center[1], - center[2], (corners[4] - corners[0]).norm(), - (corners[3] - corners[0]).norm(), cls_label - ]) - else: - center = point_mask.new_tensor([ - corners[:, 0].mean(), corners[:, 1].mean(), - coords[:, 2].mean() - ]) - point_sem[indices] = point_sem.new_tensor([ - center[0], center[1], center[2], - corners[:, 0].max() - corners[:, 0].min(), - corners[:, 1].max() - corners[:, 1].min(), cls_label - ]) - elif self.primitive_mode == 'xy': - if with_yaw: - center = coords.mean(0) - center[2] = (corners[pair_index[0], 2] + - corners[pair_index[1], 2]) / 2.0 - point_sem[indices] = point_sem.new_tensor([ - center[0], center[1], center[2], - corners[pair_index[1], 2] - corners[pair_index[0], 2], - cls_label - ]) - else: - center = point_mask.new_tensor([ - coords[:, 0].mean(), coords[:, 1].mean(), - corners[:, 2].mean() - ]) - point_sem[indices] = point_sem.new_tensor([ - center[0], center[1], center[2], - corners[:, 2].max() - corners[:, 2].min(), cls_label - ]) - point_offset[indices] = center - coords - return point_mask, point_offset, point_sem - - def _get_plane_fomulation(self, vector1, vector2, point): - """Compute the equation of the plane. - - Args: - vector1 (torch.Tensor): Parallel vector of the plane. - vector2 (torch.Tensor): Parallel vector of the plane. - point (torch.Tensor): Point on the plane. - - Returns: - torch.Tensor: Equation of the plane. - """ - surface_norm = torch.cross(vector1, vector2) - surface_dis = -torch.dot(surface_norm, point) - plane = point.new_tensor( - [surface_norm[0], surface_norm[1], surface_norm[2], surface_dis]) - return plane +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from mmcv.ops import furthest_point_sample +from mmcv.runner import BaseModule +from torch import nn as nn +from torch.nn import functional as F + +from mmdet3d.models.builder import HEADS, build_loss +from mmdet3d.models.model_utils import VoteModule +from mmdet3d.ops import build_sa_module +from mmdet.core import multi_apply + + +@HEADS.register_module() +class PrimitiveHead(BaseModule): + r"""Primitive head of `H3DNet `_. + + Args: + num_dims (int): The dimension of primitive semantic information. + num_classes (int): The number of class. + primitive_mode (str): The mode of primitive module, + available mode ['z', 'xy', 'line']. + bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and + decoding boxes. + train_cfg (dict): Config for training. + test_cfg (dict): Config for testing. + vote_module_cfg (dict): Config of VoteModule for point-wise votes. + vote_aggregation_cfg (dict): Config of vote aggregation layer. + feat_channels (tuple[int]): Convolution channels of + prediction layer. + upper_thresh (float): Threshold for line matching. + surface_thresh (float): Threshold for surface matching. + conv_cfg (dict): Config of convolution in prediction layer. + norm_cfg (dict): Config of BN in prediction layer. + objectness_loss (dict): Config of objectness loss. + center_loss (dict): Config of center loss. + semantic_loss (dict): Config of point-wise semantic segmentation loss. 
+ """ + + def __init__(self, + num_dims, + num_classes, + primitive_mode, + train_cfg=None, + test_cfg=None, + vote_module_cfg=None, + vote_aggregation_cfg=None, + feat_channels=(128, 128), + upper_thresh=100.0, + surface_thresh=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=None, + center_loss=None, + semantic_reg_loss=None, + semantic_cls_loss=None, + init_cfg=None): + super(PrimitiveHead, self).__init__(init_cfg=init_cfg) + assert primitive_mode in ['z', 'xy', 'line'] + # The dimension of primitive semantic information. + self.num_dims = num_dims + self.num_classes = num_classes + self.primitive_mode = primitive_mode + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.gt_per_seed = vote_module_cfg['gt_per_seed'] + self.num_proposal = vote_aggregation_cfg['num_point'] + self.upper_thresh = upper_thresh + self.surface_thresh = surface_thresh + + self.objectness_loss = build_loss(objectness_loss) + self.center_loss = build_loss(center_loss) + self.semantic_reg_loss = build_loss(semantic_reg_loss) + self.semantic_cls_loss = build_loss(semantic_cls_loss) + + assert vote_aggregation_cfg['mlp_channels'][0] == vote_module_cfg[ + 'in_channels'] + + # Primitive existence flag prediction + self.flag_conv = ConvModule( + vote_module_cfg['conv_channels'][-1], + vote_module_cfg['conv_channels'][-1] // 2, + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True, + inplace=True) + self.flag_pred = torch.nn.Conv1d( + vote_module_cfg['conv_channels'][-1] // 2, 2, 1) + + self.vote_module = VoteModule(**vote_module_cfg) + self.vote_aggregation = build_sa_module(vote_aggregation_cfg) + + prev_channel = vote_aggregation_cfg['mlp_channels'][-1] + conv_pred_list = list() + for k in range(len(feat_channels)): + conv_pred_list.append( + ConvModule( + prev_channel, + feat_channels[k], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True, + inplace=True)) + prev_channel = feat_channels[k] + self.conv_pred = nn.Sequential(*conv_pred_list) + + conv_out_channel = 3 + num_dims + num_classes + self.conv_pred.add_module('conv_out', + nn.Conv1d(prev_channel, conv_out_channel, 1)) + + def forward(self, feats_dict, sample_mod): + """Forward pass. + + Args: + feats_dict (dict): Feature dict from backbone. + sample_mod (str): Sample mode for vote aggregation layer. + valid modes are "vote", "seed" and "random". + + Returns: + dict: Predictions of primitive head. + """ + assert sample_mod in ['vote', 'seed', 'random'] + + seed_points = feats_dict['fp_xyz_net0'][-1] + seed_features = feats_dict['hd_feature'] + results = {} + + primitive_flag = self.flag_conv(seed_features) + primitive_flag = self.flag_pred(primitive_flag) + + results['pred_flag_' + self.primitive_mode] = primitive_flag + + # 1. generate vote_points from seed_points + vote_points, vote_features, _ = self.vote_module( + seed_points, seed_features) + results['vote_' + self.primitive_mode] = vote_points + results['vote_features_' + self.primitive_mode] = vote_features + + # 2. 
aggregate vote_points + if sample_mod == 'vote': + # use fps in vote_aggregation + sample_indices = None + elif sample_mod == 'seed': + # FPS on seed and choose the votes corresponding to the seeds + sample_indices = furthest_point_sample(seed_points, + self.num_proposal) + elif sample_mod == 'random': + # Random sampling from the votes + batch_size, num_seed = seed_points.shape[:2] + sample_indices = torch.randint( + 0, + num_seed, (batch_size, self.num_proposal), + dtype=torch.int32, + device=seed_points.device) + else: + raise NotImplementedError('Unsupported sample mod!') + + vote_aggregation_ret = self.vote_aggregation(vote_points, + vote_features, + sample_indices) + aggregated_points, features, aggregated_indices = vote_aggregation_ret + results['aggregated_points_' + self.primitive_mode] = aggregated_points + results['aggregated_features_' + self.primitive_mode] = features + results['aggregated_indices_' + + self.primitive_mode] = aggregated_indices + + # 3. predict primitive offsets and semantic information + predictions = self.conv_pred(features) + + # 4. decode predictions + decode_ret = self.primitive_decode_scores(predictions, + aggregated_points) + results.update(decode_ret) + + center, pred_ind = self.get_primitive_center( + primitive_flag, decode_ret['center_' + self.primitive_mode]) + + results['pred_' + self.primitive_mode + '_ind'] = pred_ind + results['pred_' + self.primitive_mode + '_center'] = center + return results + + def loss(self, + bbox_preds, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + img_metas=None, + gt_bboxes_ignore=None): + """Compute loss. + + Args: + bbox_preds (dict): Predictions from forward of primitive head. + points (list[torch.Tensor]): Input points. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + pts_semantic_mask (list[torch.Tensor]): Point-wise + semantic mask. + pts_instance_mask (list[torch.Tensor]): Point-wise + instance mask. + img_metas (list[dict]): Contain pcd and img's meta info. + gt_bboxes_ignore (list[torch.Tensor]): Specify + which bounding. + + Returns: + dict: Losses of Primitive Head. 
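+
+        Example:
+            For ``primitive_mode='z'`` the returned dict holds the keys
+            ``flag_loss_z``, ``vote_loss_z``, ``center_loss_z``,
+            ``size_loss_z`` and ``sem_loss_z``. Illustrative call only;
+            ``head`` and its inputs are assumed to come from a built H3DNet
+            model and its data pipeline:
+
+            >>> losses = head.loss(bbox_preds, points, gt_bboxes_3d,
+            ...                    gt_labels_3d, pts_semantic_mask,
+            ...                    pts_instance_mask, img_metas)
+            >>> # sorted(losses) == ['center_loss_z', 'flag_loss_z',
+            >>> #                    'sem_loss_z', 'size_loss_z', 'vote_loss_z']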
+ """ + targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask, + bbox_preds) + + (point_mask, point_offset, gt_primitive_center, gt_primitive_semantic, + gt_sem_cls_label, gt_primitive_mask) = targets + + losses = {} + # Compute the loss of primitive existence flag + pred_flag = bbox_preds['pred_flag_' + self.primitive_mode] + flag_loss = self.objectness_loss(pred_flag, gt_primitive_mask.long()) + losses['flag_loss_' + self.primitive_mode] = flag_loss + + # calculate vote loss + vote_loss = self.vote_module.get_loss( + bbox_preds['seed_points'], + bbox_preds['vote_' + self.primitive_mode], + bbox_preds['seed_indices'], point_mask, point_offset) + losses['vote_loss_' + self.primitive_mode] = vote_loss + + num_proposal = bbox_preds['aggregated_points_' + + self.primitive_mode].shape[1] + primitive_center = bbox_preds['center_' + self.primitive_mode] + if self.primitive_mode != 'line': + primitive_semantic = bbox_preds['size_residuals_' + + self.primitive_mode].contiguous() + else: + primitive_semantic = None + semancitc_scores = bbox_preds['sem_cls_scores_' + + self.primitive_mode].transpose(2, 1) + + gt_primitive_mask = gt_primitive_mask / \ + (gt_primitive_mask.sum() + 1e-6) + center_loss, size_loss, sem_cls_loss = self.compute_primitive_loss( + primitive_center, primitive_semantic, semancitc_scores, + num_proposal, gt_primitive_center, gt_primitive_semantic, + gt_sem_cls_label, gt_primitive_mask) + losses['center_loss_' + self.primitive_mode] = center_loss + losses['size_loss_' + self.primitive_mode] = size_loss + losses['sem_loss_' + self.primitive_mode] = sem_cls_loss + + return losses + + def get_targets(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None, + bbox_preds=None): + """Generate targets of primitive head. + + Args: + points (list[torch.Tensor]): Points of each batch. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each batch. + gt_labels_3d (list[torch.Tensor]): Labels of each batch. + pts_semantic_mask (list[torch.Tensor]): Point-wise semantic + label of each batch. + pts_instance_mask (list[torch.Tensor]): Point-wise instance + label of each batch. + bbox_preds (dict): Predictions from forward of primitive head. + + Returns: + tuple[torch.Tensor]: Targets of primitive head. 
+ """ + for index in range(len(gt_labels_3d)): + if len(gt_labels_3d[index]) == 0: + fake_box = gt_bboxes_3d[index].tensor.new_zeros( + 1, gt_bboxes_3d[index].tensor.shape[-1]) + gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) + gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) + + if pts_semantic_mask is None: + pts_semantic_mask = [None for i in range(len(gt_labels_3d))] + pts_instance_mask = [None for i in range(len(gt_labels_3d))] + + (point_mask, point_sem, + point_offset) = multi_apply(self.get_targets_single, points, + gt_bboxes_3d, gt_labels_3d, + pts_semantic_mask, pts_instance_mask) + + point_mask = torch.stack(point_mask) + point_sem = torch.stack(point_sem) + point_offset = torch.stack(point_offset) + + batch_size = point_mask.shape[0] + num_proposal = bbox_preds['aggregated_points_' + + self.primitive_mode].shape[1] + num_seed = bbox_preds['seed_points'].shape[1] + seed_inds = bbox_preds['seed_indices'].long() + seed_inds_expand = seed_inds.view(batch_size, num_seed, + 1).repeat(1, 1, 3) + seed_gt_votes = torch.gather(point_offset, 1, seed_inds_expand) + seed_gt_votes += bbox_preds['seed_points'] + gt_primitive_center = seed_gt_votes.view(batch_size * num_proposal, 1, + 3) + + seed_inds_expand_sem = seed_inds.view(batch_size, num_seed, 1).repeat( + 1, 1, 4 + self.num_dims) + seed_gt_sem = torch.gather(point_sem, 1, seed_inds_expand_sem) + gt_primitive_semantic = seed_gt_sem[:, :, 3:3 + self.num_dims].view( + batch_size * num_proposal, 1, self.num_dims).contiguous() + + gt_sem_cls_label = seed_gt_sem[:, :, -1].long() + + gt_votes_mask = torch.gather(point_mask, 1, seed_inds) + + return (point_mask, point_offset, gt_primitive_center, + gt_primitive_semantic, gt_sem_cls_label, gt_votes_mask) + + def get_targets_single(self, + points, + gt_bboxes_3d, + gt_labels_3d, + pts_semantic_mask=None, + pts_instance_mask=None): + """Generate targets of primitive head for single batch. + + Args: + points (torch.Tensor): Points of each batch. + gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth + boxes of each batch. + gt_labels_3d (torch.Tensor): Labels of each batch. + pts_semantic_mask (torch.Tensor): Point-wise semantic + label of each batch. + pts_instance_mask (torch.Tensor): Point-wise instance + label of each batch. + + Returns: + tuple[torch.Tensor]: Targets of primitive head. 
+ """ + gt_bboxes_3d = gt_bboxes_3d.to(points.device) + num_points = points.shape[0] + + point_mask = points.new_zeros(num_points) + # Offset to the primitive center + point_offset = points.new_zeros([num_points, 3]) + # Semantic information of primitive center + point_sem = points.new_zeros([num_points, 3 + self.num_dims + 1]) + + # Generate pts_semantic_mask and pts_instance_mask when they are None + if pts_semantic_mask is None or pts_instance_mask is None: + points2box_mask = gt_bboxes_3d.points_in_boxes_all(points) + assignment = points2box_mask.argmax(1) + background_mask = points2box_mask.max(1)[0] == 0 + + if pts_semantic_mask is None: + pts_semantic_mask = gt_labels_3d[assignment] + pts_semantic_mask[background_mask] = self.num_classes + + if pts_instance_mask is None: + pts_instance_mask = assignment + pts_instance_mask[background_mask] = gt_labels_3d.shape[0] + + instance_flag = torch.nonzero( + pts_semantic_mask != self.num_classes, as_tuple=False).squeeze(1) + instance_labels = pts_instance_mask[instance_flag].unique() + + with_yaw = gt_bboxes_3d.with_yaw + for i, i_instance in enumerate(instance_labels): + indices = instance_flag[pts_instance_mask[instance_flag] == + i_instance] + coords = points[indices, :3] + cur_cls_label = pts_semantic_mask[indices][0] + + # Bbox Corners + cur_corners = gt_bboxes_3d.corners[i] + + plane_lower_temp = points.new_tensor( + [0, 0, 1, -cur_corners[7, -1]]) + upper_points = cur_corners[[1, 2, 5, 6]] + refined_distance = (upper_points * plane_lower_temp[:3]).sum(dim=1) + + if self.check_horizon(upper_points) and \ + plane_lower_temp[0] + plane_lower_temp[1] < \ + self.train_cfg['lower_thresh']: + plane_lower = points.new_tensor( + [0, 0, 1, plane_lower_temp[-1]]) + plane_upper = points.new_tensor( + [0, 0, 1, -torch.mean(refined_distance)]) + else: + raise NotImplementedError('Only horizontal plane is support!') + + if self.check_dist(plane_upper, upper_points) is False: + raise NotImplementedError( + 'Mean distance to plane should be lower than thresh!') + + # Get the boundary points here + point2plane_dist, selected = self.match_point2plane( + plane_lower, coords) + + # Get bottom four lines + if self.primitive_mode == 'line': + point2line_matching = self.match_point2line( + coords[selected], cur_corners, with_yaw, mode='bottom') + + point_mask, point_offset, point_sem = \ + self._assign_primitive_line_targets(point_mask, + point_offset, + point_sem, + coords[selected], + indices[selected], + cur_cls_label, + point2line_matching, + cur_corners, + [1, 1, 0, 0], + with_yaw, + mode='bottom') + + # Set the surface labels here + if self.primitive_mode == 'z' and \ + selected.sum() > self.train_cfg['num_point'] and \ + point2plane_dist[selected].var() < \ + self.train_cfg['var_thresh']: + + point_mask, point_offset, point_sem = \ + self._assign_primitive_surface_targets(point_mask, + point_offset, + point_sem, + coords[selected], + indices[selected], + cur_cls_label, + cur_corners, + with_yaw, + mode='bottom') + + # Get the boundary points here + point2plane_dist, selected = self.match_point2plane( + plane_upper, coords) + + # Get top four lines + if self.primitive_mode == 'line': + point2line_matching = self.match_point2line( + coords[selected], cur_corners, with_yaw, mode='top') + + point_mask, point_offset, point_sem = \ + self._assign_primitive_line_targets(point_mask, + point_offset, + point_sem, + coords[selected], + indices[selected], + cur_cls_label, + point2line_matching, + cur_corners, + [1, 1, 0, 0], + with_yaw, + mode='top') + + if 
self.primitive_mode == 'z' and \ + selected.sum() > self.train_cfg['num_point'] and \ + point2plane_dist[selected].var() < \ + self.train_cfg['var_thresh']: + + point_mask, point_offset, point_sem = \ + self._assign_primitive_surface_targets(point_mask, + point_offset, + point_sem, + coords[selected], + indices[selected], + cur_cls_label, + cur_corners, + with_yaw, + mode='top') + + # Get left two lines + plane_left_temp = self._get_plane_fomulation( + cur_corners[2] - cur_corners[3], + cur_corners[3] - cur_corners[0], cur_corners[0]) + + right_points = cur_corners[[4, 5, 7, 6]] + plane_left_temp /= torch.norm(plane_left_temp[:3]) + refined_distance = (right_points * plane_left_temp[:3]).sum(dim=1) + + if plane_left_temp[2] < self.train_cfg['lower_thresh']: + plane_left = plane_left_temp + plane_right = points.new_tensor([ + plane_left_temp[0], plane_left_temp[1], plane_left_temp[2], + -refined_distance.mean() + ]) + else: + raise NotImplementedError( + 'Normal vector of the plane should be horizontal!') + + # Get the boundary points here + point2plane_dist, selected = self.match_point2plane( + plane_left, coords) + + # Get left four lines + if self.primitive_mode == 'line': + point2line_matching = self.match_point2line( + coords[selected], cur_corners, with_yaw, mode='left') + point_mask, point_offset, point_sem = \ + self._assign_primitive_line_targets( + point_mask, point_offset, point_sem, + coords[selected], indices[selected], cur_cls_label, + point2line_matching[2:], cur_corners, [2, 2], + with_yaw, mode='left') + + if self.primitive_mode == 'xy' and \ + selected.sum() > self.train_cfg['num_point'] and \ + point2plane_dist[selected].var() < \ + self.train_cfg['var_thresh']: + + point_mask, point_offset, point_sem = \ + self._assign_primitive_surface_targets( + point_mask, point_offset, point_sem, + coords[selected], indices[selected], cur_cls_label, + cur_corners, with_yaw, mode='left') + + # Get the boundary points here + point2plane_dist, selected = self.match_point2plane( + plane_right, coords) + + # Get right four lines + if self.primitive_mode == 'line': + point2line_matching = self.match_point2line( + coords[selected], cur_corners, with_yaw, mode='right') + + point_mask, point_offset, point_sem = \ + self._assign_primitive_line_targets( + point_mask, point_offset, point_sem, + coords[selected], indices[selected], cur_cls_label, + point2line_matching[2:], cur_corners, [2, 2], + with_yaw, mode='right') + + if self.primitive_mode == 'xy' and \ + selected.sum() > self.train_cfg['num_point'] and \ + point2plane_dist[selected].var() < \ + self.train_cfg['var_thresh']: + + point_mask, point_offset, point_sem = \ + self._assign_primitive_surface_targets( + point_mask, point_offset, point_sem, + coords[selected], indices[selected], cur_cls_label, + cur_corners, with_yaw, mode='right') + + plane_front_temp = self._get_plane_fomulation( + cur_corners[0] - cur_corners[4], + cur_corners[4] - cur_corners[5], cur_corners[5]) + + back_points = cur_corners[[3, 2, 7, 6]] + plane_front_temp /= torch.norm(plane_front_temp[:3]) + refined_distance = (back_points * plane_front_temp[:3]).sum(dim=1) + + if plane_front_temp[2] < self.train_cfg['lower_thresh']: + plane_front = plane_front_temp + plane_back = points.new_tensor([ + plane_front_temp[0], plane_front_temp[1], + plane_front_temp[2], -torch.mean(refined_distance) + ]) + else: + raise NotImplementedError( + 'Normal vector of the plane should be horizontal!') + + # Get the boundary points here + point2plane_dist, selected = 
self.match_point2plane( + plane_front, coords) + + if self.primitive_mode == 'xy' and \ + selected.sum() > self.train_cfg['num_point'] and \ + (point2plane_dist[selected]).var() < \ + self.train_cfg['var_thresh']: + + point_mask, point_offset, point_sem = \ + self._assign_primitive_surface_targets( + point_mask, point_offset, point_sem, + coords[selected], indices[selected], cur_cls_label, + cur_corners, with_yaw, mode='front') + + # Get the boundary points here + point2plane_dist, selected = self.match_point2plane( + plane_back, coords) + + if self.primitive_mode == 'xy' and \ + selected.sum() > self.train_cfg['num_point'] and \ + point2plane_dist[selected].var() < \ + self.train_cfg['var_thresh']: + + point_mask, point_offset, point_sem = \ + self._assign_primitive_surface_targets( + point_mask, point_offset, point_sem, + coords[selected], indices[selected], cur_cls_label, + cur_corners, with_yaw, mode='back') + + return (point_mask, point_sem, point_offset) + + def primitive_decode_scores(self, predictions, aggregated_points): + """Decode predicted parts to primitive head. + + Args: + predictions (torch.Tensor): primitive pridictions of each batch. + aggregated_points (torch.Tensor): The aggregated points + of vote stage. + + Returns: + Dict: Predictions of primitive head, including center, + semantic size and semantic scores. + """ + + ret_dict = {} + pred_transposed = predictions.transpose(2, 1) + + center = aggregated_points + pred_transposed[:, :, 0:3] + ret_dict['center_' + self.primitive_mode] = center + + if self.primitive_mode in ['z', 'xy']: + ret_dict['size_residuals_' + self.primitive_mode] = \ + pred_transposed[:, :, 3:3 + self.num_dims] + + ret_dict['sem_cls_scores_' + self.primitive_mode] = \ + pred_transposed[:, :, 3 + self.num_dims:] + + return ret_dict + + def check_horizon(self, points): + """Check whether is a horizontal plane. + + Args: + points (torch.Tensor): Points of input. + + Returns: + Bool: Flag of result. + """ + return (points[0][-1] == points[1][-1]) and \ + (points[1][-1] == points[2][-1]) and \ + (points[2][-1] == points[3][-1]) + + def check_dist(self, plane_equ, points): + """Whether the mean of points to plane distance is lower than thresh. + + Args: + plane_equ (torch.Tensor): Plane to be checked. + points (torch.Tensor): Points to be checked. + + Returns: + Tuple: Flag of result. + """ + return (points[:, 2] + + plane_equ[-1]).sum() / 4.0 < self.train_cfg['lower_thresh'] + + def point2line_dist(self, points, pts_a, pts_b): + """Calculate the distance from point to line. + + Args: + points (torch.Tensor): Points of input. + pts_a (torch.Tensor): Point on the specific line. + pts_b (torch.Tensor): Point on the specific line. + + Returns: + torch.Tensor: Distance between each point to line. + """ + line_a2b = pts_b - pts_a + line_a2pts = points - pts_a + length = (line_a2pts * line_a2b.view(1, 3)).sum(1) / \ + line_a2b.norm() + dist = (line_a2pts.norm(dim=1)**2 - length**2).sqrt() + + return dist + + def match_point2line(self, points, corners, with_yaw, mode='bottom'): + """Match points to corresponding line. + + Args: + points (torch.Tensor): Points of input. + corners (torch.Tensor): Eight corners of a bounding box. + with_yaw (Bool): Whether the boundind box is with rotation. + mode (str, optional): Specify which line should be matched, + available mode are ('bottom', 'top', 'left', 'right'). + Defaults to 'bottom'. + + Returns: + Tuple: Flag of matching correspondence. 
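point2line_dist above projects each point onto the line direction and recovers the perpendicular distance via the Pythagorean relation dist^2 = |p - a|^2 - proj^2. A standalone numeric check of that formula (not using the head class):

import torch

def point_to_line_dist(points, pt_a, pt_b):
    # Direction of the line and vectors from pt_a to every query point.
    line_a2b = pt_b - pt_a
    line_a2pts = points - pt_a
    # Length of the projection of each point onto the line direction.
    proj = (line_a2pts * line_a2b.view(1, 3)).sum(1) / line_a2b.norm()
    # Perpendicular distance via the Pythagorean theorem.
    return (line_a2pts.norm(dim=1) ** 2 - proj ** 2).sqrt()

pts = torch.tensor([[0.0, 1.0, 0.0], [2.0, 3.0, 0.0]])
a = torch.tensor([0.0, 0.0, 0.0])
b = torch.tensor([5.0, 0.0, 0.0])   # a line along the x axis
print(point_to_line_dist(pts, a, b))  # tensor([1., 3.])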
+ """ + if with_yaw: + corners_pair = { + 'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]], + 'top': [[1, 2], [5, 6], [1, 5], [2, 6]], + 'left': [[0, 1], [3, 2], [0, 1], [3, 2]], + 'right': [[4, 5], [7, 6], [4, 5], [7, 6]] + } + selected_list = [] + for pair_index in corners_pair[mode]: + selected = self.point2line_dist( + points, corners[pair_index[0]], corners[pair_index[1]]) \ + < self.train_cfg['line_thresh'] + selected_list.append(selected) + else: + xmin, ymin, _ = corners.min(0)[0] + xmax, ymax, _ = corners.max(0)[0] + sel1 = torch.abs(points[:, 0] - + xmin) < self.train_cfg['line_thresh'] + sel2 = torch.abs(points[:, 0] - + xmax) < self.train_cfg['line_thresh'] + sel3 = torch.abs(points[:, 1] - + ymin) < self.train_cfg['line_thresh'] + sel4 = torch.abs(points[:, 1] - + ymax) < self.train_cfg['line_thresh'] + selected_list = [sel1, sel2, sel3, sel4] + return selected_list + + def match_point2plane(self, plane, points): + """Match points to plane. + + Args: + plane (torch.Tensor): Equation of the plane. + points (torch.Tensor): Points of input. + + Returns: + Tuple: Distance of each point to the plane and + flag of matching correspondence. + """ + point2plane_dist = torch.abs((points * plane[:3]).sum(dim=1) + + plane[-1]) + min_dist = point2plane_dist.min() + selected = torch.abs(point2plane_dist - + min_dist) < self.train_cfg['dist_thresh'] + return point2plane_dist, selected + + def compute_primitive_loss(self, primitive_center, primitive_semantic, + semantic_scores, num_proposal, + gt_primitive_center, gt_primitive_semantic, + gt_sem_cls_label, gt_primitive_mask): + """Compute loss of primitive module. + + Args: + primitive_center (torch.Tensor): Pridictions of primitive center. + primitive_semantic (torch.Tensor): Pridictions of primitive + semantic. + semantic_scores (torch.Tensor): Pridictions of primitive + semantic scores. + num_proposal (int): The number of primitive proposal. + gt_primitive_center (torch.Tensor): Ground truth of + primitive center. + gt_votes_sem (torch.Tensor): Ground truth of primitive semantic. + gt_sem_cls_label (torch.Tensor): Ground truth of primitive + semantic class. + gt_primitive_mask (torch.Tensor): Ground truth of primitive mask. + + Returns: + Tuple: Loss of primitive module. + """ + batch_size = primitive_center.shape[0] + vote_xyz_reshape = primitive_center.view(batch_size * num_proposal, -1, + 3) + + center_loss = self.center_loss( + vote_xyz_reshape, + gt_primitive_center, + dst_weight=gt_primitive_mask.view(batch_size * num_proposal, 1))[1] + + if self.primitive_mode != 'line': + size_xyz_reshape = primitive_semantic.view( + batch_size * num_proposal, -1, self.num_dims).contiguous() + size_loss = self.semantic_reg_loss( + size_xyz_reshape, + gt_primitive_semantic, + dst_weight=gt_primitive_mask.view(batch_size * num_proposal, + 1))[1] + else: + size_loss = center_loss.new_tensor(0.0) + + # Semantic cls loss + sem_cls_loss = self.semantic_cls_loss( + semantic_scores, gt_sem_cls_label, weight=gt_primitive_mask) + + return center_loss, size_loss, sem_cls_loss + + def get_primitive_center(self, pred_flag, center): + """Generate primitive center from predictions. + + Args: + pred_flag (torch.Tensor): Scores of primitive center. + center (torch.Tensor): Pridictions of primitive center. + + Returns: + Tuple: Primitive center and the prediction indices. 
+ """ + ind_normal = F.softmax(pred_flag, dim=1) + pred_indices = (ind_normal[:, 1, :] > + self.surface_thresh).detach().float() + selected = (ind_normal[:, 1, :] <= + self.surface_thresh).detach().float() + offset = torch.ones_like(center) * self.upper_thresh + center = center + offset * selected.unsqueeze(-1) + return center, pred_indices + + def _assign_primitive_line_targets(self, + point_mask, + point_offset, + point_sem, + coords, + indices, + cls_label, + point2line_matching, + corners, + center_axises, + with_yaw, + mode='bottom'): + """Generate targets of line primitive. + + Args: + point_mask (torch.Tensor): Tensor to store the ground + truth of mask. + point_offset (torch.Tensor): Tensor to store the ground + truth of offset. + point_sem (torch.Tensor): Tensor to store the ground + truth of semantic. + coords (torch.Tensor): The selected points. + indices (torch.Tensor): Indices of the selected points. + cls_label (int): Class label of the ground truth bounding box. + point2line_matching (torch.Tensor): Flag indicate that + matching line of each point. + corners (torch.Tensor): Corners of the ground truth bounding box. + center_axises (list[int]): Indicate in which axis the line center + should be refined. + with_yaw (Bool): Whether the boundind box is with rotation. + mode (str, optional): Specify which line should be matched, + available mode are ('bottom', 'top', 'left', 'right'). + Defaults to 'bottom'. + + Returns: + Tuple: Targets of the line primitive. + """ + corners_pair = { + 'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]], + 'top': [[1, 2], [5, 6], [1, 5], [2, 6]], + 'left': [[0, 1], [3, 2]], + 'right': [[4, 5], [7, 6]] + } + corners_pair = corners_pair[mode] + assert len(corners_pair) == len(point2line_matching) == len( + center_axises) + for line_select, center_axis, pair_index in zip( + point2line_matching, center_axises, corners_pair): + if line_select.sum() > self.train_cfg['num_point_line']: + point_mask[indices[line_select]] = 1.0 + + if with_yaw: + line_center = (corners[pair_index[0]] + + corners[pair_index[1]]) / 2 + else: + line_center = coords[line_select].mean(dim=0) + line_center[center_axis] = corners[:, center_axis].mean() + + point_offset[indices[line_select]] = \ + line_center - coords[line_select] + point_sem[indices[line_select]] = \ + point_sem.new_tensor([line_center[0], line_center[1], + line_center[2], cls_label]) + return point_mask, point_offset, point_sem + + def _assign_primitive_surface_targets(self, + point_mask, + point_offset, + point_sem, + coords, + indices, + cls_label, + corners, + with_yaw, + mode='bottom'): + """Generate targets for primitive z and primitive xy. + + Args: + point_mask (torch.Tensor): Tensor to store the ground + truth of mask. + point_offset (torch.Tensor): Tensor to store the ground + truth of offset. + point_sem (torch.Tensor): Tensor to store the ground + truth of semantic. + coords (torch.Tensor): The selected points. + indices (torch.Tensor): Indices of the selected points. + cls_label (int): Class label of the ground truth bounding box. + corners (torch.Tensor): Corners of the ground truth bounding box. + with_yaw (Bool): Whether the boundind box is with rotation. + mode (str, optional): Specify which line should be matched, + available mode are ('bottom', 'top', 'left', 'right', + 'front', 'back'). + Defaults to 'bottom'. + + Returns: + Tuple: Targets of the center primitive. 
+ """ + point_mask[indices] = 1.0 + corners_pair = { + 'bottom': [0, 7], + 'top': [1, 6], + 'left': [0, 1], + 'right': [4, 5], + 'front': [0, 1], + 'back': [3, 2] + } + pair_index = corners_pair[mode] + if self.primitive_mode == 'z': + if with_yaw: + center = (corners[pair_index[0]] + + corners[pair_index[1]]) / 2.0 + center[2] = coords[:, 2].mean() + point_sem[indices] = point_sem.new_tensor([ + center[0], center[1], + center[2], (corners[4] - corners[0]).norm(), + (corners[3] - corners[0]).norm(), cls_label + ]) + else: + center = point_mask.new_tensor([ + corners[:, 0].mean(), corners[:, 1].mean(), + coords[:, 2].mean() + ]) + point_sem[indices] = point_sem.new_tensor([ + center[0], center[1], center[2], + corners[:, 0].max() - corners[:, 0].min(), + corners[:, 1].max() - corners[:, 1].min(), cls_label + ]) + elif self.primitive_mode == 'xy': + if with_yaw: + center = coords.mean(0) + center[2] = (corners[pair_index[0], 2] + + corners[pair_index[1], 2]) / 2.0 + point_sem[indices] = point_sem.new_tensor([ + center[0], center[1], center[2], + corners[pair_index[1], 2] - corners[pair_index[0], 2], + cls_label + ]) + else: + center = point_mask.new_tensor([ + coords[:, 0].mean(), coords[:, 1].mean(), + corners[:, 2].mean() + ]) + point_sem[indices] = point_sem.new_tensor([ + center[0], center[1], center[2], + corners[:, 2].max() - corners[:, 2].min(), cls_label + ]) + point_offset[indices] = center - coords + return point_mask, point_offset, point_sem + + def _get_plane_fomulation(self, vector1, vector2, point): + """Compute the equation of the plane. + + Args: + vector1 (torch.Tensor): Parallel vector of the plane. + vector2 (torch.Tensor): Parallel vector of the plane. + point (torch.Tensor): Point on the plane. + + Returns: + torch.Tensor: Equation of the plane. + """ + surface_norm = torch.cross(vector1, vector2) + surface_dis = -torch.dot(surface_norm, point) + plane = point.new_tensor( + [surface_norm[0], surface_norm[1], surface_norm[2], surface_dis]) + return plane diff --git a/mmdet3d/models/roi_heads/part_aggregation_roi_head.py b/mmdet3d/models/roi_heads/part_aggregation_roi_head.py index a3e49ea..7b66bac 100644 --- a/mmdet3d/models/roi_heads/part_aggregation_roi_head.py +++ b/mmdet3d/models/roi_heads/part_aggregation_roi_head.py @@ -1,325 +1,325 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import warnings - -from torch.nn import functional as F - -from mmdet3d.core import AssignResult -from mmdet3d.core.bbox import bbox3d2result, bbox3d2roi -from mmdet.core import build_assigner, build_sampler -from ..builder import HEADS, build_head, build_roi_extractor -from .base_3droi_head import Base3DRoIHead - - -@HEADS.register_module() -class PartAggregationROIHead(Base3DRoIHead): - """Part aggregation roi head for PartA2. - - Args: - semantic_head (ConfigDict): Config of semantic head. - num_classes (int): The number of classes. - seg_roi_extractor (ConfigDict): Config of seg_roi_extractor. - part_roi_extractor (ConfigDict): Config of part_roi_extractor. - bbox_head (ConfigDict): Config of bbox_head. - train_cfg (ConfigDict): Training config. - test_cfg (ConfigDict): Testing config. 
- """ - - def __init__(self, - semantic_head, - num_classes=3, - seg_roi_extractor=None, - part_roi_extractor=None, - bbox_head=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(PartAggregationROIHead, self).__init__( - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - init_cfg=init_cfg) - self.num_classes = num_classes - assert semantic_head is not None - self.semantic_head = build_head(semantic_head) - - if seg_roi_extractor is not None: - self.seg_roi_extractor = build_roi_extractor(seg_roi_extractor) - if part_roi_extractor is not None: - self.part_roi_extractor = build_roi_extractor(part_roi_extractor) - - self.init_assigner_sampler() - - assert not (init_cfg and pretrained), \ - 'init_cfg and pretrained cannot be setting at the same time' - if isinstance(pretrained, str): - warnings.warn('DeprecationWarning: pretrained is a deprecated, ' - 'please use "init_cfg" instead') - self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) - - def init_mask_head(self): - """Initialize mask head, skip since ``PartAggregationROIHead`` does not - have one.""" - pass - - def init_bbox_head(self, bbox_head): - """Initialize box head.""" - self.bbox_head = build_head(bbox_head) - - def init_assigner_sampler(self): - """Initialize assigner and sampler.""" - self.bbox_assigner = None - self.bbox_sampler = None - if self.train_cfg: - if isinstance(self.train_cfg.assigner, dict): - self.bbox_assigner = build_assigner(self.train_cfg.assigner) - elif isinstance(self.train_cfg.assigner, list): - self.bbox_assigner = [ - build_assigner(res) for res in self.train_cfg.assigner - ] - self.bbox_sampler = build_sampler(self.train_cfg.sampler) - - @property - def with_semantic(self): - """bool: whether the head has semantic branch""" - return hasattr(self, - 'semantic_head') and self.semantic_head is not None - - def forward_train(self, feats_dict, voxels_dict, img_metas, proposal_list, - gt_bboxes_3d, gt_labels_3d): - """Training forward function of PartAggregationROIHead. - - Args: - feats_dict (dict): Contains features from the first stage. - voxels_dict (dict): Contains information of voxels. - img_metas (list[dict]): Meta info of each image. - proposal_list (list[dict]): Proposal information from rpn. - The dictionary should contain the following keys: - - - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes - - labels_3d (torch.Tensor): Labels of proposals - - cls_preds (torch.Tensor): Original scores of proposals - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): - GT bboxes of each sample. The bboxes are encapsulated - by 3D box structures. - gt_labels_3d (list[LongTensor]): GT labels of each sample. - - Returns: - dict: losses from each head. - - - loss_semantic (torch.Tensor): loss of semantic head - - loss_bbox (torch.Tensor): loss of bboxes - """ - losses = dict() - if self.with_semantic: - semantic_results = self._semantic_forward_train( - feats_dict['seg_features'], voxels_dict, gt_bboxes_3d, - gt_labels_3d) - losses.update(semantic_results['loss_semantic']) - - sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d, - gt_labels_3d) - if self.with_bbox: - bbox_results = self._bbox_forward_train( - feats_dict['seg_features'], semantic_results['part_feats'], - voxels_dict, sample_results) - losses.update(bbox_results['loss_bbox']) - - return losses - - def simple_test(self, feats_dict, voxels_dict, img_metas, proposal_list, - **kwargs): - """Simple testing forward function of PartAggregationROIHead. 
- - Note: - This function assumes that the batch size is 1 - - Args: - feats_dict (dict): Contains features from the first stage. - voxels_dict (dict): Contains information of voxels. - img_metas (list[dict]): Meta info of each image. - proposal_list (list[dict]): Proposal information from rpn. - - Returns: - dict: Bbox results of one frame. - """ - assert self.with_bbox, 'Bbox head must be implemented.' - assert self.with_semantic - - semantic_results = self.semantic_head(feats_dict['seg_features']) - - rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list]) - labels_3d = [res['labels_3d'] for res in proposal_list] - cls_preds = [res['cls_preds'] for res in proposal_list] - bbox_results = self._bbox_forward(feats_dict['seg_features'], - semantic_results['part_feats'], - voxels_dict, rois) - - bbox_list = self.bbox_head.get_bboxes( - rois, - bbox_results['cls_score'], - bbox_results['bbox_pred'], - labels_3d, - cls_preds, - img_metas, - cfg=self.test_cfg) - - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def _bbox_forward_train(self, seg_feats, part_feats, voxels_dict, - sampling_results): - """Forward training function of roi_extractor and bbox_head. - - Args: - seg_feats (torch.Tensor): Point-wise semantic features. - part_feats (torch.Tensor): Point-wise part prediction features. - voxels_dict (dict): Contains information of voxels. - sampling_results (:obj:`SamplingResult`): Sampled results used - for training. - - Returns: - dict: Forward results including losses and predictions. - """ - rois = bbox3d2roi([res.bboxes for res in sampling_results]) - bbox_results = self._bbox_forward(seg_feats, part_feats, voxels_dict, - rois) - - bbox_targets = self.bbox_head.get_targets(sampling_results, - self.train_cfg) - loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], - bbox_results['bbox_pred'], rois, - *bbox_targets) - - bbox_results.update(loss_bbox=loss_bbox) - return bbox_results - - def _bbox_forward(self, seg_feats, part_feats, voxels_dict, rois): - """Forward function of roi_extractor and bbox_head used in both - training and testing. - - Args: - seg_feats (torch.Tensor): Point-wise semantic features. - part_feats (torch.Tensor): Point-wise part prediction features. - voxels_dict (dict): Contains information of voxels. - rois (Tensor): Roi boxes. - - Returns: - dict: Contains predictions of bbox_head and - features of roi_extractor. - """ - pooled_seg_feats = self.seg_roi_extractor(seg_feats, - voxels_dict['voxel_centers'], - voxels_dict['coors'][..., 0], - rois) - pooled_part_feats = self.part_roi_extractor( - part_feats, voxels_dict['voxel_centers'], - voxels_dict['coors'][..., 0], rois) - cls_score, bbox_pred = self.bbox_head(pooled_seg_feats, - pooled_part_feats) - - bbox_results = dict( - cls_score=cls_score, - bbox_pred=bbox_pred, - pooled_seg_feats=pooled_seg_feats, - pooled_part_feats=pooled_part_feats) - return bbox_results - - def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d): - """Assign and sample proposals for training. - - Args: - proposal_list (list[dict]): Proposals produced by RPN. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes. - gt_labels_3d (list[torch.Tensor]): Ground truth labels - - Returns: - list[:obj:`SamplingResult`]: Sampled results of each training - sample. 
- """ - sampling_results = [] - # bbox assign - for batch_idx in range(len(proposal_list)): - cur_proposal_list = proposal_list[batch_idx] - cur_boxes = cur_proposal_list['boxes_3d'] - cur_labels_3d = cur_proposal_list['labels_3d'] - cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device) - cur_gt_labels = gt_labels_3d[batch_idx] - - batch_num_gts = 0 - # 0 is bg - batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0) - batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes)) - # -1 is bg - batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1) - - # each class may have its own assigner - if isinstance(self.bbox_assigner, list): - for i, assigner in enumerate(self.bbox_assigner): - gt_per_cls = (cur_gt_labels == i) - pred_per_cls = (cur_labels_3d == i) - cur_assign_res = assigner.assign( - cur_boxes.tensor[pred_per_cls], - cur_gt_bboxes.tensor[gt_per_cls], - gt_labels=cur_gt_labels[gt_per_cls]) - # gather assign_results in different class into one result - batch_num_gts += cur_assign_res.num_gts - # gt inds (1-based) - gt_inds_arange_pad = gt_per_cls.nonzero( - as_tuple=False).view(-1) + 1 - # pad 0 for indice unassigned - gt_inds_arange_pad = F.pad( - gt_inds_arange_pad, (1, 0), mode='constant', value=0) - # pad -1 for indice ignore - gt_inds_arange_pad = F.pad( - gt_inds_arange_pad, (1, 0), mode='constant', value=-1) - # convert to 0~gt_num+2 for indices - gt_inds_arange_pad += 1 - # now 0 is bg, >1 is fg in batch_gt_indis - batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[ - cur_assign_res.gt_inds + 1] - 1 - batch_max_overlaps[ - pred_per_cls] = cur_assign_res.max_overlaps - batch_gt_labels[pred_per_cls] = cur_assign_res.labels - - assign_result = AssignResult(batch_num_gts, batch_gt_indis, - batch_max_overlaps, - batch_gt_labels) - else: # for single class - assign_result = self.bbox_assigner.assign( - cur_boxes.tensor, - cur_gt_bboxes.tensor, - gt_labels=cur_gt_labels) - # sample boxes - sampling_result = self.bbox_sampler.sample(assign_result, - cur_boxes.tensor, - cur_gt_bboxes.tensor, - cur_gt_labels) - sampling_results.append(sampling_result) - return sampling_results - - def _semantic_forward_train(self, x, voxels_dict, gt_bboxes_3d, - gt_labels_3d): - """Train semantic head. - - Args: - x (torch.Tensor): Point-wise semantic features for segmentation - voxels_dict (dict): Contains information of voxels. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes. - gt_labels_3d (list[torch.Tensor]): Ground truth labels - - Returns: - dict: Segmentation results including losses - """ - semantic_results = self.semantic_head(x) - semantic_targets = self.semantic_head.get_targets( - voxels_dict, gt_bboxes_3d, gt_labels_3d) - loss_semantic = self.semantic_head.loss(semantic_results, - semantic_targets) - semantic_results.update(loss_semantic=loss_semantic) - return semantic_results +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from torch.nn import functional as F + +from mmdet3d.core import AssignResult +from mmdet3d.core.bbox import bbox3d2result, bbox3d2roi +from mmdet.core import build_assigner, build_sampler +from ..builder import HEADS, build_head, build_roi_extractor +from .base_3droi_head import Base3DRoIHead + + +@HEADS.register_module() +class PartAggregationROIHead(Base3DRoIHead): + """Part aggregation roi head for PartA2. + + Args: + semantic_head (ConfigDict): Config of semantic head. + num_classes (int): The number of classes. + seg_roi_extractor (ConfigDict): Config of seg_roi_extractor. 
+ part_roi_extractor (ConfigDict): Config of part_roi_extractor. + bbox_head (ConfigDict): Config of bbox_head. + train_cfg (ConfigDict): Training config. + test_cfg (ConfigDict): Testing config. + """ + + def __init__(self, + semantic_head, + num_classes=3, + seg_roi_extractor=None, + part_roi_extractor=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(PartAggregationROIHead, self).__init__( + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + init_cfg=init_cfg) + self.num_classes = num_classes + assert semantic_head is not None + self.semantic_head = build_head(semantic_head) + + if seg_roi_extractor is not None: + self.seg_roi_extractor = build_roi_extractor(seg_roi_extractor) + if part_roi_extractor is not None: + self.part_roi_extractor = build_roi_extractor(part_roi_extractor) + + self.init_assigner_sampler() + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is a deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + + def init_mask_head(self): + """Initialize mask head, skip since ``PartAggregationROIHead`` does not + have one.""" + pass + + def init_bbox_head(self, bbox_head): + """Initialize box head.""" + self.bbox_head = build_head(bbox_head) + + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + if isinstance(self.train_cfg.assigner, dict): + self.bbox_assigner = build_assigner(self.train_cfg.assigner) + elif isinstance(self.train_cfg.assigner, list): + self.bbox_assigner = [ + build_assigner(res) for res in self.train_cfg.assigner + ] + self.bbox_sampler = build_sampler(self.train_cfg.sampler) + + @property + def with_semantic(self): + """bool: whether the head has semantic branch""" + return hasattr(self, + 'semantic_head') and self.semantic_head is not None + + def forward_train(self, feats_dict, voxels_dict, img_metas, proposal_list, + gt_bboxes_3d, gt_labels_3d): + """Training forward function of PartAggregationROIHead. + + Args: + feats_dict (dict): Contains features from the first stage. + voxels_dict (dict): Contains information of voxels. + img_metas (list[dict]): Meta info of each image. + proposal_list (list[dict]): Proposal information from rpn. + The dictionary should contain the following keys: + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes + - labels_3d (torch.Tensor): Labels of proposals + - cls_preds (torch.Tensor): Original scores of proposals + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): + GT bboxes of each sample. The bboxes are encapsulated + by 3D box structures. + gt_labels_3d (list[LongTensor]): GT labels of each sample. + + Returns: + dict: losses from each head. 
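init_assigner_sampler accepts either a single assigner config or one config per class; the list form is what lets _assign_and_sample later loop over classes with class-specific matching thresholds. A registry-free sketch of the same dispatch, using a hypothetical make_assigner factory in place of mmdet's build_assigner (illustration only):

from dataclasses import dataclass

@dataclass
class DummyAssigner:
    """Stand-in for an IoU-based assigner built from a config dict."""
    pos_iou_thr: float

def make_assigner(cfg):
    # Hypothetical factory standing in for mmdet.core.build_assigner.
    return DummyAssigner(pos_iou_thr=cfg['pos_iou_thr'])

def init_assigner(assigner_cfg):
    # Single config -> one shared assigner; list -> one assigner per class.
    if isinstance(assigner_cfg, dict):
        return make_assigner(assigner_cfg)
    return [make_assigner(cfg) for cfg in assigner_cfg]

per_class_cfg = [dict(pos_iou_thr=0.55), dict(pos_iou_thr=0.60), dict(pos_iou_thr=0.45)]
print(init_assigner(per_class_cfg))   # three DummyAssigner instances, one per class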
+ + - loss_semantic (torch.Tensor): loss of semantic head + - loss_bbox (torch.Tensor): loss of bboxes + """ + losses = dict() + if self.with_semantic: + semantic_results = self._semantic_forward_train( + feats_dict['seg_features'], voxels_dict, gt_bboxes_3d, + gt_labels_3d) + losses.update(semantic_results['loss_semantic']) + + sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d, + gt_labels_3d) + if self.with_bbox: + bbox_results = self._bbox_forward_train( + feats_dict['seg_features'], semantic_results['part_feats'], + voxels_dict, sample_results) + losses.update(bbox_results['loss_bbox']) + + return losses + + def simple_test(self, feats_dict, voxels_dict, img_metas, proposal_list, + **kwargs): + """Simple testing forward function of PartAggregationROIHead. + + Note: + This function assumes that the batch size is 1 + + Args: + feats_dict (dict): Contains features from the first stage. + voxels_dict (dict): Contains information of voxels. + img_metas (list[dict]): Meta info of each image. + proposal_list (list[dict]): Proposal information from rpn. + + Returns: + dict: Bbox results of one frame. + """ + assert self.with_bbox, 'Bbox head must be implemented.' + assert self.with_semantic + + semantic_results = self.semantic_head(feats_dict['seg_features']) + + rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list]) + labels_3d = [res['labels_3d'] for res in proposal_list] + cls_preds = [res['cls_preds'] for res in proposal_list] + bbox_results = self._bbox_forward(feats_dict['seg_features'], + semantic_results['part_feats'], + voxels_dict, rois) + + bbox_list = self.bbox_head.get_bboxes( + rois, + bbox_results['cls_score'], + bbox_results['bbox_pred'], + labels_3d, + cls_preds, + img_metas, + cfg=self.test_cfg) + + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def _bbox_forward_train(self, seg_feats, part_feats, voxels_dict, + sampling_results): + """Forward training function of roi_extractor and bbox_head. + + Args: + seg_feats (torch.Tensor): Point-wise semantic features. + part_feats (torch.Tensor): Point-wise part prediction features. + voxels_dict (dict): Contains information of voxels. + sampling_results (:obj:`SamplingResult`): Sampled results used + for training. + + Returns: + dict: Forward results including losses and predictions. + """ + rois = bbox3d2roi([res.bboxes for res in sampling_results]) + bbox_results = self._bbox_forward(seg_feats, part_feats, voxels_dict, + rois) + + bbox_targets = self.bbox_head.get_targets(sampling_results, + self.train_cfg) + loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], + bbox_results['bbox_pred'], rois, + *bbox_targets) + + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + + def _bbox_forward(self, seg_feats, part_feats, voxels_dict, rois): + """Forward function of roi_extractor and bbox_head used in both + training and testing. + + Args: + seg_feats (torch.Tensor): Point-wise semantic features. + part_feats (torch.Tensor): Point-wise part prediction features. + voxels_dict (dict): Contains information of voxels. + rois (Tensor): Roi boxes. + + Returns: + dict: Contains predictions of bbox_head and + features of roi_extractor. 
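Both the training and testing paths above funnel per-sample proposals through bbox3d2roi which, like the 2D bbox2roi, concatenates the box tensors of all samples and prepends a batch-index column so that one RoI tensor can be pooled per batch element. A minimal stand-in with the same layout, shown for illustration rather than as the library implementation:

import torch

def boxes_to_rois(box_list):
    """Concatenate per-sample (N_i, 7) box tensors into (sum N_i, 8) RoIs,
    where column 0 stores the sample index."""
    rois = []
    for batch_idx, boxes in enumerate(box_list):
        inds = boxes.new_full((boxes.shape[0], 1), batch_idx)
        rois.append(torch.cat([inds, boxes], dim=1))
    return torch.cat(rois, dim=0)

sample0 = torch.rand(2, 7)   # two proposals in the first sample
sample1 = torch.rand(3, 7)   # three proposals in the second sample
rois = boxes_to_rois([sample0, sample1])
print(rois.shape)   # torch.Size([5, 8])
print(rois[:, 0])   # tensor([0., 0., 1., 1., 1.])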
+ """ + pooled_seg_feats = self.seg_roi_extractor(seg_feats, + voxels_dict['voxel_centers'], + voxels_dict['coors'][..., 0], + rois) + pooled_part_feats = self.part_roi_extractor( + part_feats, voxels_dict['voxel_centers'], + voxels_dict['coors'][..., 0], rois) + cls_score, bbox_pred = self.bbox_head(pooled_seg_feats, + pooled_part_feats) + + bbox_results = dict( + cls_score=cls_score, + bbox_pred=bbox_pred, + pooled_seg_feats=pooled_seg_feats, + pooled_part_feats=pooled_part_feats) + return bbox_results + + def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d): + """Assign and sample proposals for training. + + Args: + proposal_list (list[dict]): Proposals produced by RPN. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes. + gt_labels_3d (list[torch.Tensor]): Ground truth labels + + Returns: + list[:obj:`SamplingResult`]: Sampled results of each training + sample. + """ + sampling_results = [] + # bbox assign + for batch_idx in range(len(proposal_list)): + cur_proposal_list = proposal_list[batch_idx] + cur_boxes = cur_proposal_list['boxes_3d'] + cur_labels_3d = cur_proposal_list['labels_3d'] + cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device) + cur_gt_labels = gt_labels_3d[batch_idx] + + batch_num_gts = 0 + # 0 is bg + batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0) + batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes)) + # -1 is bg + batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1) + + # each class may have its own assigner + if isinstance(self.bbox_assigner, list): + for i, assigner in enumerate(self.bbox_assigner): + gt_per_cls = (cur_gt_labels == i) + pred_per_cls = (cur_labels_3d == i) + cur_assign_res = assigner.assign( + cur_boxes.tensor[pred_per_cls], + cur_gt_bboxes.tensor[gt_per_cls], + gt_labels=cur_gt_labels[gt_per_cls]) + # gather assign_results in different class into one result + batch_num_gts += cur_assign_res.num_gts + # gt inds (1-based) + gt_inds_arange_pad = gt_per_cls.nonzero( + as_tuple=False).view(-1) + 1 + # pad 0 for indice unassigned + gt_inds_arange_pad = F.pad( + gt_inds_arange_pad, (1, 0), mode='constant', value=0) + # pad -1 for indice ignore + gt_inds_arange_pad = F.pad( + gt_inds_arange_pad, (1, 0), mode='constant', value=-1) + # convert to 0~gt_num+2 for indices + gt_inds_arange_pad += 1 + # now 0 is bg, >1 is fg in batch_gt_indis + batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[ + cur_assign_res.gt_inds + 1] - 1 + batch_max_overlaps[ + pred_per_cls] = cur_assign_res.max_overlaps + batch_gt_labels[pred_per_cls] = cur_assign_res.labels + + assign_result = AssignResult(batch_num_gts, batch_gt_indis, + batch_max_overlaps, + batch_gt_labels) + else: # for single class + assign_result = self.bbox_assigner.assign( + cur_boxes.tensor, + cur_gt_bboxes.tensor, + gt_labels=cur_gt_labels) + # sample boxes + sampling_result = self.bbox_sampler.sample(assign_result, + cur_boxes.tensor, + cur_gt_bboxes.tensor, + cur_gt_labels) + sampling_results.append(sampling_result) + return sampling_results + + def _semantic_forward_train(self, x, voxels_dict, gt_bboxes_3d, + gt_labels_3d): + """Train semantic head. + + Args: + x (torch.Tensor): Point-wise semantic features for segmentation + voxels_dict (dict): Contains information of voxels. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes. 
+ gt_labels_3d (list[torch.Tensor]): Ground truth labels + + Returns: + dict: Segmentation results including losses + """ + semantic_results = self.semantic_head(x) + semantic_targets = self.semantic_head.get_targets( + voxels_dict, gt_bboxes_3d, gt_labels_3d) + loss_semantic = self.semantic_head.loss(semantic_results, + semantic_targets) + semantic_results.update(loss_semantic=loss_semantic) + return semantic_results diff --git a/mmdet3d/models/roi_heads/point_rcnn_roi_head.py b/mmdet3d/models/roi_heads/point_rcnn_roi_head.py index acf7c16..5e13c16 100644 --- a/mmdet3d/models/roi_heads/point_rcnn_roi_head.py +++ b/mmdet3d/models/roi_heads/point_rcnn_roi_head.py @@ -1,286 +1,286 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch.nn import functional as F - -from mmdet3d.core import AssignResult -from mmdet3d.core.bbox import bbox3d2result, bbox3d2roi -from mmdet.core import build_assigner, build_sampler -from ..builder import HEADS, build_head, build_roi_extractor -from .base_3droi_head import Base3DRoIHead - - -@HEADS.register_module() -class PointRCNNRoIHead(Base3DRoIHead): - """RoI head for PointRCNN. - - Args: - bbox_head (dict): Config of bbox_head. - point_roi_extractor (dict): Config of RoI extractor. - train_cfg (dict): Train configs. - test_cfg (dict): Test configs. - depth_normalizer (float, optional): Normalize depth feature. - Defaults to 70.0. - init_cfg (dict, optional): Config of initialization. Defaults to None. - """ - - def __init__(self, - bbox_head, - point_roi_extractor, - train_cfg, - test_cfg, - depth_normalizer=70.0, - pretrained=None, - init_cfg=None): - super(PointRCNNRoIHead, self).__init__( - bbox_head=bbox_head, - train_cfg=train_cfg, - test_cfg=test_cfg, - pretrained=pretrained, - init_cfg=init_cfg) - self.depth_normalizer = depth_normalizer - - if point_roi_extractor is not None: - self.point_roi_extractor = build_roi_extractor(point_roi_extractor) - - self.init_assigner_sampler() - - def init_bbox_head(self, bbox_head): - """Initialize box head. - - Args: - bbox_head (dict): Config dict of RoI Head. - """ - self.bbox_head = build_head(bbox_head) - - def init_mask_head(self): - """Initialize maek head.""" - pass - - def init_assigner_sampler(self): - """Initialize assigner and sampler.""" - self.bbox_assigner = None - self.bbox_sampler = None - if self.train_cfg: - if isinstance(self.train_cfg.assigner, dict): - self.bbox_assigner = build_assigner(self.train_cfg.assigner) - elif isinstance(self.train_cfg.assigner, list): - self.bbox_assigner = [ - build_assigner(res) for res in self.train_cfg.assigner - ] - self.bbox_sampler = build_sampler(self.train_cfg.sampler) - - def forward_train(self, feats_dict, input_metas, proposal_list, - gt_bboxes_3d, gt_labels_3d): - """Training forward function of PointRCNNRoIHead. - - Args: - feats_dict (dict): Contains features from the first stage. - imput_metas (list[dict]): Meta info of each input. - proposal_list (list[dict]): Proposal information from rpn. - The dictionary should contain the following keys: - - - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes - - labels_3d (torch.Tensor): Labels of proposals - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): - GT bboxes of each sample. The bboxes are encapsulated - by 3D box structures. - gt_labels_3d (list[LongTensor]): GT labels of each sample. - - Returns: - dict: Losses from RoI RCNN head. 
- - loss_bbox (torch.Tensor): Loss of bboxes - """ - features = feats_dict['features'] - points = feats_dict['points'] - point_cls_preds = feats_dict['points_cls_preds'] - sem_scores = point_cls_preds.sigmoid() - point_scores = sem_scores.max(-1)[0] - - sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d, - gt_labels_3d) - - # concat the depth, semantic features and backbone features - features = features.transpose(1, 2).contiguous() - point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5 - features_list = [ - point_scores.unsqueeze(2), - point_depths.unsqueeze(2), features - ] - features = torch.cat(features_list, dim=2) - - bbox_results = self._bbox_forward_train(features, points, - sample_results) - losses = dict() - losses.update(bbox_results['loss_bbox']) - - return losses - - def simple_test(self, feats_dict, img_metas, proposal_list, **kwargs): - """Simple testing forward function of PointRCNNRoIHead. - - Note: - This function assumes that the batch size is 1 - - Args: - feats_dict (dict): Contains features from the first stage. - img_metas (list[dict]): Meta info of each image. - proposal_list (list[dict]): Proposal information from rpn. - - Returns: - dict: Bbox results of one frame. - """ - rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list]) - labels_3d = [res['labels_3d'] for res in proposal_list] - - features = feats_dict['features'] - points = feats_dict['points'] - point_cls_preds = feats_dict['points_cls_preds'] - sem_scores = point_cls_preds.sigmoid() - point_scores = sem_scores.max(-1)[0] - - features = features.transpose(1, 2).contiguous() - point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5 - features_list = [ - point_scores.unsqueeze(2), - point_depths.unsqueeze(2), features - ] - - features = torch.cat(features_list, dim=2) - batch_size = features.shape[0] - bbox_results = self._bbox_forward(features, points, batch_size, rois) - object_score = bbox_results['cls_score'].sigmoid() - bbox_list = self.bbox_head.get_bboxes( - rois, - object_score, - bbox_results['bbox_pred'], - labels_3d, - img_metas, - cfg=self.test_cfg) - - bbox_results = [ - bbox3d2result(bboxes, scores, labels) - for bboxes, scores, labels in bbox_list - ] - return bbox_results - - def _bbox_forward_train(self, features, points, sampling_results): - """Forward training function of roi_extractor and bbox_head. - - Args: - features (torch.Tensor): Backbone features with depth and \ - semantic features. - points (torch.Tensor): Pointcloud. - sampling_results (:obj:`SamplingResult`): Sampled results used - for training. - - Returns: - dict: Forward results including losses and predictions. - """ - rois = bbox3d2roi([res.bboxes for res in sampling_results]) - batch_size = features.shape[0] - bbox_results = self._bbox_forward(features, points, batch_size, rois) - bbox_targets = self.bbox_head.get_targets(sampling_results, - self.train_cfg) - - loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], - bbox_results['bbox_pred'], rois, - *bbox_targets) - - bbox_results.update(loss_bbox=loss_bbox) - return bbox_results - - def _bbox_forward(self, features, points, batch_size, rois): - """Forward function of roi_extractor and bbox_head used in both - training and testing. - - Args: - features (torch.Tensor): Backbone features with depth and - semantic features. - points (torch.Tensor): Pointcloud. - batch_size (int): Batch size. - rois (torch.Tensor): RoI boxes. - - Returns: - dict: Contains predictions of bbox_head and - features of roi_extractor. 
- """ - pooled_point_feats = self.point_roi_extractor(features, points, - batch_size, rois) - - cls_score, bbox_pred = self.bbox_head(pooled_point_feats) - bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred) - return bbox_results - - def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d): - """Assign and sample proposals for training. - - Args: - proposal_list (list[dict]): Proposals produced by RPN. - gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth - boxes. - gt_labels_3d (list[torch.Tensor]): Ground truth labels - - Returns: - list[:obj:`SamplingResult`]: Sampled results of each training - sample. - """ - sampling_results = [] - # bbox assign - for batch_idx in range(len(proposal_list)): - cur_proposal_list = proposal_list[batch_idx] - cur_boxes = cur_proposal_list['boxes_3d'] - cur_labels_3d = cur_proposal_list['labels_3d'] - cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device) - cur_gt_labels = gt_labels_3d[batch_idx] - batch_num_gts = 0 - # 0 is bg - batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0) - batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes)) - # -1 is bg - batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1) - - # each class may have its own assigner - if isinstance(self.bbox_assigner, list): - for i, assigner in enumerate(self.bbox_assigner): - gt_per_cls = (cur_gt_labels == i) - pred_per_cls = (cur_labels_3d == i) - cur_assign_res = assigner.assign( - cur_boxes.tensor[pred_per_cls], - cur_gt_bboxes.tensor[gt_per_cls], - gt_labels=cur_gt_labels[gt_per_cls]) - # gather assign_results in different class into one result - batch_num_gts += cur_assign_res.num_gts - # gt inds (1-based) - gt_inds_arange_pad = gt_per_cls.nonzero( - as_tuple=False).view(-1) + 1 - # pad 0 for indice unassigned - gt_inds_arange_pad = F.pad( - gt_inds_arange_pad, (1, 0), mode='constant', value=0) - # pad -1 for indice ignore - gt_inds_arange_pad = F.pad( - gt_inds_arange_pad, (1, 0), mode='constant', value=-1) - # convert to 0~gt_num+2 for indices - gt_inds_arange_pad += 1 - # now 0 is bg, >1 is fg in batch_gt_indis - batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[ - cur_assign_res.gt_inds + 1] - 1 - batch_max_overlaps[ - pred_per_cls] = cur_assign_res.max_overlaps - batch_gt_labels[pred_per_cls] = cur_assign_res.labels - - assign_result = AssignResult(batch_num_gts, batch_gt_indis, - batch_max_overlaps, - batch_gt_labels) - else: # for single class - assign_result = self.bbox_assigner.assign( - cur_boxes.tensor, - cur_gt_bboxes.tensor, - gt_labels=cur_gt_labels) - - # sample boxes - sampling_result = self.bbox_sampler.sample(assign_result, - cur_boxes.tensor, - cur_gt_bboxes.tensor, - cur_gt_labels) - sampling_results.append(sampling_result) - return sampling_results +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn import functional as F + +from mmdet3d.core import AssignResult +from mmdet3d.core.bbox import bbox3d2result, bbox3d2roi +from mmdet.core import build_assigner, build_sampler +from ..builder import HEADS, build_head, build_roi_extractor +from .base_3droi_head import Base3DRoIHead + + +@HEADS.register_module() +class PointRCNNRoIHead(Base3DRoIHead): + """RoI head for PointRCNN. + + Args: + bbox_head (dict): Config of bbox_head. + point_roi_extractor (dict): Config of RoI extractor. + train_cfg (dict): Train configs. + test_cfg (dict): Test configs. + depth_normalizer (float, optional): Normalize depth feature. + Defaults to 70.0. 
+ init_cfg (dict, optional): Config of initialization. Defaults to None. + """ + + def __init__(self, + bbox_head, + point_roi_extractor, + train_cfg, + test_cfg, + depth_normalizer=70.0, + pretrained=None, + init_cfg=None): + super(PointRCNNRoIHead, self).__init__( + bbox_head=bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + init_cfg=init_cfg) + self.depth_normalizer = depth_normalizer + + if point_roi_extractor is not None: + self.point_roi_extractor = build_roi_extractor(point_roi_extractor) + + self.init_assigner_sampler() + + def init_bbox_head(self, bbox_head): + """Initialize box head. + + Args: + bbox_head (dict): Config dict of RoI Head. + """ + self.bbox_head = build_head(bbox_head) + + def init_mask_head(self): + """Initialize maek head.""" + pass + + def init_assigner_sampler(self): + """Initialize assigner and sampler.""" + self.bbox_assigner = None + self.bbox_sampler = None + if self.train_cfg: + if isinstance(self.train_cfg.assigner, dict): + self.bbox_assigner = build_assigner(self.train_cfg.assigner) + elif isinstance(self.train_cfg.assigner, list): + self.bbox_assigner = [ + build_assigner(res) for res in self.train_cfg.assigner + ] + self.bbox_sampler = build_sampler(self.train_cfg.sampler) + + def forward_train(self, feats_dict, input_metas, proposal_list, + gt_bboxes_3d, gt_labels_3d): + """Training forward function of PointRCNNRoIHead. + + Args: + feats_dict (dict): Contains features from the first stage. + imput_metas (list[dict]): Meta info of each input. + proposal_list (list[dict]): Proposal information from rpn. + The dictionary should contain the following keys: + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes + - labels_3d (torch.Tensor): Labels of proposals + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): + GT bboxes of each sample. The bboxes are encapsulated + by 3D box structures. + gt_labels_3d (list[LongTensor]): GT labels of each sample. + + Returns: + dict: Losses from RoI RCNN head. + - loss_bbox (torch.Tensor): Loss of bboxes + """ + features = feats_dict['features'] + points = feats_dict['points'] + point_cls_preds = feats_dict['points_cls_preds'] + sem_scores = point_cls_preds.sigmoid() + point_scores = sem_scores.max(-1)[0] + + sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d, + gt_labels_3d) + + # concat the depth, semantic features and backbone features + features = features.transpose(1, 2).contiguous() + point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5 + features_list = [ + point_scores.unsqueeze(2), + point_depths.unsqueeze(2), features + ] + features = torch.cat(features_list, dim=2) + + bbox_results = self._bbox_forward_train(features, points, + sample_results) + losses = dict() + losses.update(bbox_results['loss_bbox']) + + return losses + + def simple_test(self, feats_dict, img_metas, proposal_list, **kwargs): + """Simple testing forward function of PointRCNNRoIHead. + + Note: + This function assumes that the batch size is 1 + + Args: + feats_dict (dict): Contains features from the first stage. + img_metas (list[dict]): Meta info of each image. + proposal_list (list[dict]): Proposal information from rpn. + + Returns: + dict: Bbox results of one frame. 
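The RoI stage of PointRCNN enriches the backbone features with two extra per-point channels: the best semantic score and a depth feature pulled to roughly [-0.5, 0.5] by depth_normalizer. A shape-level sketch of that concatenation (sizes are arbitrary):

import torch

batch_size, num_points, channels = 2, 128, 64
depth_normalizer = 70.0

features = torch.randn(batch_size, channels, num_points)       # backbone output (B, C, N)
points = torch.rand(batch_size, num_points, 3) * 40.0          # xyz coordinates
point_cls_preds = torch.randn(batch_size, num_points, 3)       # per-point class logits

sem_scores = point_cls_preds.sigmoid()
point_scores = sem_scores.max(-1)[0]                            # (B, N) best semantic score

features = features.transpose(1, 2).contiguous()                # (B, N, C)
point_depths = points.norm(dim=2) / depth_normalizer - 0.5      # roughly in [-0.5, 0.5]

fused = torch.cat(
    [point_scores.unsqueeze(2), point_depths.unsqueeze(2), features], dim=2)
print(fused.shape)  # torch.Size([2, 128, 66])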
+ """ + rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list]) + labels_3d = [res['labels_3d'] for res in proposal_list] + + features = feats_dict['features'] + points = feats_dict['points'] + point_cls_preds = feats_dict['points_cls_preds'] + sem_scores = point_cls_preds.sigmoid() + point_scores = sem_scores.max(-1)[0] + + features = features.transpose(1, 2).contiguous() + point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5 + features_list = [ + point_scores.unsqueeze(2), + point_depths.unsqueeze(2), features + ] + + features = torch.cat(features_list, dim=2) + batch_size = features.shape[0] + bbox_results = self._bbox_forward(features, points, batch_size, rois) + object_score = bbox_results['cls_score'].sigmoid() + bbox_list = self.bbox_head.get_bboxes( + rois, + object_score, + bbox_results['bbox_pred'], + labels_3d, + img_metas, + cfg=self.test_cfg) + + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def _bbox_forward_train(self, features, points, sampling_results): + """Forward training function of roi_extractor and bbox_head. + + Args: + features (torch.Tensor): Backbone features with depth and \ + semantic features. + points (torch.Tensor): Pointcloud. + sampling_results (:obj:`SamplingResult`): Sampled results used + for training. + + Returns: + dict: Forward results including losses and predictions. + """ + rois = bbox3d2roi([res.bboxes for res in sampling_results]) + batch_size = features.shape[0] + bbox_results = self._bbox_forward(features, points, batch_size, rois) + bbox_targets = self.bbox_head.get_targets(sampling_results, + self.train_cfg) + + loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], + bbox_results['bbox_pred'], rois, + *bbox_targets) + + bbox_results.update(loss_bbox=loss_bbox) + return bbox_results + + def _bbox_forward(self, features, points, batch_size, rois): + """Forward function of roi_extractor and bbox_head used in both + training and testing. + + Args: + features (torch.Tensor): Backbone features with depth and + semantic features. + points (torch.Tensor): Pointcloud. + batch_size (int): Batch size. + rois (torch.Tensor): RoI boxes. + + Returns: + dict: Contains predictions of bbox_head and + features of roi_extractor. + """ + pooled_point_feats = self.point_roi_extractor(features, points, + batch_size, rois) + + cls_score, bbox_pred = self.bbox_head(pooled_point_feats) + bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred) + return bbox_results + + def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d): + """Assign and sample proposals for training. + + Args: + proposal_list (list[dict]): Proposals produced by RPN. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes. + gt_labels_3d (list[torch.Tensor]): Ground truth labels + + Returns: + list[:obj:`SamplingResult`]: Sampled results of each training + sample. 
+ """ + sampling_results = [] + # bbox assign + for batch_idx in range(len(proposal_list)): + cur_proposal_list = proposal_list[batch_idx] + cur_boxes = cur_proposal_list['boxes_3d'] + cur_labels_3d = cur_proposal_list['labels_3d'] + cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device) + cur_gt_labels = gt_labels_3d[batch_idx] + batch_num_gts = 0 + # 0 is bg + batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0) + batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes)) + # -1 is bg + batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1) + + # each class may have its own assigner + if isinstance(self.bbox_assigner, list): + for i, assigner in enumerate(self.bbox_assigner): + gt_per_cls = (cur_gt_labels == i) + pred_per_cls = (cur_labels_3d == i) + cur_assign_res = assigner.assign( + cur_boxes.tensor[pred_per_cls], + cur_gt_bboxes.tensor[gt_per_cls], + gt_labels=cur_gt_labels[gt_per_cls]) + # gather assign_results in different class into one result + batch_num_gts += cur_assign_res.num_gts + # gt inds (1-based) + gt_inds_arange_pad = gt_per_cls.nonzero( + as_tuple=False).view(-1) + 1 + # pad 0 for indice unassigned + gt_inds_arange_pad = F.pad( + gt_inds_arange_pad, (1, 0), mode='constant', value=0) + # pad -1 for indice ignore + gt_inds_arange_pad = F.pad( + gt_inds_arange_pad, (1, 0), mode='constant', value=-1) + # convert to 0~gt_num+2 for indices + gt_inds_arange_pad += 1 + # now 0 is bg, >1 is fg in batch_gt_indis + batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[ + cur_assign_res.gt_inds + 1] - 1 + batch_max_overlaps[ + pred_per_cls] = cur_assign_res.max_overlaps + batch_gt_labels[pred_per_cls] = cur_assign_res.labels + + assign_result = AssignResult(batch_num_gts, batch_gt_indis, + batch_max_overlaps, + batch_gt_labels) + else: # for single class + assign_result = self.bbox_assigner.assign( + cur_boxes.tensor, + cur_gt_bboxes.tensor, + gt_labels=cur_gt_labels) + + # sample boxes + sampling_result = self.bbox_sampler.sample(assign_result, + cur_boxes.tensor, + cur_gt_bboxes.tensor, + cur_gt_labels) + sampling_results.append(sampling_result) + return sampling_results diff --git a/mmdet3d/models/roi_heads/roi_extractors/__init__.py b/mmdet3d/models/roi_heads/roi_extractors/__init__.py index 70c2881..7d6ec28 100644 --- a/mmdet3d/models/roi_heads/roi_extractors/__init__.py +++ b/mmdet3d/models/roi_heads/roi_extractors/__init__.py @@ -1,9 +1,9 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmdet.models.roi_heads.roi_extractors import SingleRoIExtractor -from .single_roiaware_extractor import Single3DRoIAwareExtractor -from .single_roipoint_extractor import Single3DRoIPointExtractor - -__all__ = [ - 'SingleRoIExtractor', 'Single3DRoIAwareExtractor', - 'Single3DRoIPointExtractor' -] +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.models.roi_heads.roi_extractors import SingleRoIExtractor +from .single_roiaware_extractor import Single3DRoIAwareExtractor +from .single_roipoint_extractor import Single3DRoIPointExtractor + +__all__ = [ + 'SingleRoIExtractor', 'Single3DRoIAwareExtractor', + 'Single3DRoIPointExtractor' +] diff --git a/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py b/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py index c27a004..5f06e17 100644 --- a/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py +++ b/mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py @@ -1,54 +1,54 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
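# Illustrative worked example (not part of the patch, all tensors made up): the
# index bookkeeping in _assign_and_sample() above is easy to misread, so this
# shows how a per-class assigner result is lifted to batch-level ground-truth
# indices by the double pad + shift trick.
import torch
import torch.nn.functional as F

cur_gt_labels = torch.tensor([1, 0, 1])        # three GTs in the sample, classes 0/1
gt_per_cls = cur_gt_labels == 1                # GTs 0 and 2 belong to class 1
cls_gt_inds = torch.tensor([-1, 0, 2, 1])      # assigner output for four class-1 proposals:
                                               # ignore, background, 2nd class-1 GT, 1st class-1 GT
pad = gt_per_cls.nonzero(as_tuple=False).view(-1) + 1    # tensor([1, 3]): 1-based batch positions
pad = F.pad(pad, (1, 0), mode='constant', value=0)       # slot for "unassigned"
pad = F.pad(pad, (1, 0), mode='constant', value=-1)      # slot for "ignore"
pad += 1                                                 # tensor([0, 1, 2, 4])
batch_gt_inds = pad[cls_gt_inds + 1] - 1
print(batch_gt_inds)   # tensor([-1, 0, 3, 1]) -> ignore, background, GT #3, GT #1 (1-based)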
-import torch -from mmcv import ops -from mmcv.runner import BaseModule - -from mmdet3d.models.builder import ROI_EXTRACTORS - - -@ROI_EXTRACTORS.register_module() -class Single3DRoIAwareExtractor(BaseModule): - """Point-wise roi-aware Extractor. - - Extract Point-wise roi features. - - Args: - roi_layer (dict): The config of roi layer. - """ - - def __init__(self, roi_layer=None, init_cfg=None): - super(Single3DRoIAwareExtractor, self).__init__(init_cfg=init_cfg) - self.roi_layer = self.build_roi_layers(roi_layer) - - def build_roi_layers(self, layer_cfg): - """Build roi layers using `layer_cfg`""" - cfg = layer_cfg.copy() - layer_type = cfg.pop('type') - assert hasattr(ops, layer_type) - layer_cls = getattr(ops, layer_type) - roi_layers = layer_cls(**cfg) - return roi_layers - - def forward(self, feats, coordinate, batch_inds, rois): - """Extract point-wise roi features. - - Args: - feats (torch.FloatTensor): Point-wise features with - shape (batch, npoints, channels) for pooling. - coordinate (torch.FloatTensor): Coordinate of each point. - batch_inds (torch.LongTensor): Indicate the batch of each point. - rois (torch.FloatTensor): Roi boxes with batch indices. - - Returns: - torch.FloatTensor: Pooled features - """ - pooled_roi_feats = [] - for batch_idx in range(int(batch_inds.max()) + 1): - roi_inds = (rois[..., 0].int() == batch_idx) - coors_inds = (batch_inds.int() == batch_idx) - pooled_roi_feat = self.roi_layer(rois[..., 1:][roi_inds], - coordinate[coors_inds], - feats[coors_inds]) - pooled_roi_feats.append(pooled_roi_feat) - pooled_roi_feats = torch.cat(pooled_roi_feats, 0) - return pooled_roi_feats +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv import ops +from mmcv.runner import BaseModule + +from mmdet3d.models.builder import ROI_EXTRACTORS + + +@ROI_EXTRACTORS.register_module() +class Single3DRoIAwareExtractor(BaseModule): + """Point-wise roi-aware Extractor. + + Extract Point-wise roi features. + + Args: + roi_layer (dict): The config of roi layer. + """ + + def __init__(self, roi_layer=None, init_cfg=None): + super(Single3DRoIAwareExtractor, self).__init__(init_cfg=init_cfg) + self.roi_layer = self.build_roi_layers(roi_layer) + + def build_roi_layers(self, layer_cfg): + """Build roi layers using `layer_cfg`""" + cfg = layer_cfg.copy() + layer_type = cfg.pop('type') + assert hasattr(ops, layer_type) + layer_cls = getattr(ops, layer_type) + roi_layers = layer_cls(**cfg) + return roi_layers + + def forward(self, feats, coordinate, batch_inds, rois): + """Extract point-wise roi features. + + Args: + feats (torch.FloatTensor): Point-wise features with + shape (batch, npoints, channels) for pooling. + coordinate (torch.FloatTensor): Coordinate of each point. + batch_inds (torch.LongTensor): Indicate the batch of each point. + rois (torch.FloatTensor): Roi boxes with batch indices. 
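# For reference, build_roi_layers() above resolves a plain config dict into an
# mmcv.ops layer. The layer type and keyword arguments below are an assumption
# chosen for illustration; any class exposed by mmcv.ops is handled the same way.
from mmcv import ops

layer_cfg = dict(type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='max')
cfg = layer_cfg.copy()
layer_cls = getattr(ops, cfg.pop('type'))   # e.g. ops.RoIAwarePool3d
roi_layer = layer_cls(**cfg)                # instantiated with the remaining kwargs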
+ + Returns: + torch.FloatTensor: Pooled features + """ + pooled_roi_feats = [] + for batch_idx in range(int(batch_inds.max()) + 1): + roi_inds = (rois[..., 0].int() == batch_idx) + coors_inds = (batch_inds.int() == batch_idx) + pooled_roi_feat = self.roi_layer(rois[..., 1:][roi_inds], + coordinate[coors_inds], + feats[coors_inds]) + pooled_roi_feats.append(pooled_roi_feat) + pooled_roi_feats = torch.cat(pooled_roi_feats, 0) + return pooled_roi_feats diff --git a/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py b/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py index 4983a01..8bbe6be 100644 --- a/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py +++ b/mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py @@ -1,64 +1,64 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv import ops -from torch import nn as nn - -from mmdet3d.core.bbox.structures import rotation_3d_in_axis -from mmdet3d.models.builder import ROI_EXTRACTORS - - -@ROI_EXTRACTORS.register_module() -class Single3DRoIPointExtractor(nn.Module): - """Point-wise roi-aware Extractor. - - Extract Point-wise roi features. - - Args: - roi_layer (dict): The config of roi layer. - """ - - def __init__(self, roi_layer=None): - super(Single3DRoIPointExtractor, self).__init__() - self.roi_layer = self.build_roi_layers(roi_layer) - - def build_roi_layers(self, layer_cfg): - """Build roi layers using `layer_cfg`""" - cfg = layer_cfg.copy() - layer_type = cfg.pop('type') - assert hasattr(ops, layer_type) - layer_cls = getattr(ops, layer_type) - roi_layers = layer_cls(**cfg) - return roi_layers - - def forward(self, feats, coordinate, batch_inds, rois): - """Extract point-wise roi features. - - Args: - feats (torch.FloatTensor): Point-wise features with - shape (batch, npoints, channels) for pooling. - coordinate (torch.FloatTensor): Coordinate of each point. - batch_inds (torch.LongTensor): Indicate the batch of each point. - rois (torch.FloatTensor): Roi boxes with batch indices. - - Returns: - torch.FloatTensor: Pooled features - """ - rois = rois[..., 1:] - rois = rois.view(batch_inds, -1, rois.shape[-1]) - with torch.no_grad(): - pooled_roi_feat, pooled_empty_flag = self.roi_layer( - coordinate, feats, rois) - - # canonical transformation - roi_center = rois[:, :, 0:3] - pooled_roi_feat[:, :, :, 0:3] -= roi_center.unsqueeze(dim=2) - pooled_roi_feat = pooled_roi_feat.view(-1, - pooled_roi_feat.shape[-2], - pooled_roi_feat.shape[-1]) - pooled_roi_feat[:, :, 0:3] = rotation_3d_in_axis( - pooled_roi_feat[:, :, 0:3], - -(rois.view(-1, rois.shape[-1])[:, 6]), - axis=2) - pooled_roi_feat[pooled_empty_flag.view(-1) > 0] = 0 - - return pooled_roi_feat +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv import ops +from torch import nn as nn + +from mmdet3d.core.bbox.structures import rotation_3d_in_axis +from mmdet3d.models.builder import ROI_EXTRACTORS + + +@ROI_EXTRACTORS.register_module() +class Single3DRoIPointExtractor(nn.Module): + """Point-wise roi-aware Extractor. + + Extract Point-wise roi features. + + Args: + roi_layer (dict): The config of roi layer. 
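# Both extractors expect `rois` whose first column is the sample index, which is
# what bbox3d2roi() produces in the RoI head earlier in this patch. A quick,
# self-contained illustration with random boxes:
import torch
from mmdet3d.core.bbox import bbox3d2roi

boxes_sample0 = torch.rand(2, 7)   # (x, y, z, dx, dy, dz, yaw) per box
boxes_sample1 = torch.rand(3, 7)
rois = bbox3d2roi([boxes_sample0, boxes_sample1])
print(rois.shape)           # torch.Size([5, 8]) -> batch index + 7 box parameters
print(rois[:, 0].tolist())  # [0.0, 0.0, 1.0, 1.0, 1.0]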
+ """ + + def __init__(self, roi_layer=None): + super(Single3DRoIPointExtractor, self).__init__() + self.roi_layer = self.build_roi_layers(roi_layer) + + def build_roi_layers(self, layer_cfg): + """Build roi layers using `layer_cfg`""" + cfg = layer_cfg.copy() + layer_type = cfg.pop('type') + assert hasattr(ops, layer_type) + layer_cls = getattr(ops, layer_type) + roi_layers = layer_cls(**cfg) + return roi_layers + + def forward(self, feats, coordinate, batch_inds, rois): + """Extract point-wise roi features. + + Args: + feats (torch.FloatTensor): Point-wise features with + shape (batch, npoints, channels) for pooling. + coordinate (torch.FloatTensor): Coordinate of each point. + batch_inds (torch.LongTensor): Indicate the batch of each point. + rois (torch.FloatTensor): Roi boxes with batch indices. + + Returns: + torch.FloatTensor: Pooled features + """ + rois = rois[..., 1:] + rois = rois.view(batch_inds, -1, rois.shape[-1]) + with torch.no_grad(): + pooled_roi_feat, pooled_empty_flag = self.roi_layer( + coordinate, feats, rois) + + # canonical transformation + roi_center = rois[:, :, 0:3] + pooled_roi_feat[:, :, :, 0:3] -= roi_center.unsqueeze(dim=2) + pooled_roi_feat = pooled_roi_feat.view(-1, + pooled_roi_feat.shape[-2], + pooled_roi_feat.shape[-1]) + pooled_roi_feat[:, :, 0:3] = rotation_3d_in_axis( + pooled_roi_feat[:, :, 0:3], + -(rois.view(-1, rois.shape[-1])[:, 6]), + axis=2) + pooled_roi_feat[pooled_empty_flag.view(-1) > 0] = 0 + + return pooled_roi_feat diff --git a/mmdet3d/models/segmentors/__init__.py b/mmdet3d/models/segmentors/__init__.py index 29fbc33..014d218 100644 --- a/mmdet3d/models/segmentors/__init__.py +++ b/mmdet3d/models/segmentors/__init__.py @@ -1,5 +1,5 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .base import Base3DSegmentor -from .encoder_decoder import EncoderDecoder3D - -__all__ = ['Base3DSegmentor', 'EncoderDecoder3D'] +# Copyright (c) OpenMMLab. All rights reserved. +from .base import Base3DSegmentor +from .encoder_decoder import EncoderDecoder3D + +__all__ = ['Base3DSegmentor', 'EncoderDecoder3D'] diff --git a/mmdet3d/models/segmentors/base.py b/mmdet3d/models/segmentors/base.py index 9913698..0f4e72e 100644 --- a/mmdet3d/models/segmentors/base.py +++ b/mmdet3d/models/segmentors/base.py @@ -1,136 +1,136 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from os import path as osp - -import mmcv -import numpy as np -import torch -from mmcv.parallel import DataContainer as DC -from mmcv.runner import auto_fp16 - -from mmdet3d.core import show_seg_result -from mmseg.models.segmentors import BaseSegmentor - - -class Base3DSegmentor(BaseSegmentor): - """Base class for 3D segmentors. - - The main difference with `BaseSegmentor` is that we modify the keys in - data_dict and use a 3D seg specific visualization function. - """ - - @property - def with_regularization_loss(self): - """bool: whether the segmentor has regularization loss for weight""" - return hasattr(self, 'loss_regularization') and \ - self.loss_regularization is not None - - def forward_test(self, points, img_metas, **kwargs): - """Calls either simple_test or aug_test depending on the length of - outer list of points. If len(points) == 1, call simple_test. Otherwise - call aug_test to aggregate the test results by e.g. voting. - - Args: - points (list[list[torch.Tensor]]): the outer list indicates - test-time augmentations and inner torch.Tensor should have a - shape BXNxC, which contains all points in the batch. 
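# To make the canonical transformation in Single3DRoIPointExtractor.forward()
# above concrete, this toy sketch (shapes and values invented) mirrors the same
# two steps: re-centre the pooled points on each RoI, then undo its yaw.
import torch
from mmdet3d.core.bbox.structures import rotation_3d_in_axis

rois = torch.tensor([[[0.5, 1.0, -1.0, 3.9, 1.6, 1.56, 0.3]]])  # (B=1, M=1, 7)
pooled = torch.rand(1, 1, 512, 3 + 2)                           # (B, M, sampled points, xyz + 2 feats)

pooled[..., 0:3] -= rois[..., 0:3].unsqueeze(2)                 # shift into the box-centred frame
pooled = pooled.view(-1, pooled.shape[-2], pooled.shape[-1])    # (B*M, 512, 5)
pooled[:, :, 0:3] = rotation_3d_in_axis(
    pooled[:, :, 0:3], -rois.view(-1, 7)[:, 6], axis=2)         # rotate by -yaw, as in the code above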
- img_metas (list[list[dict]]): the outer list indicates test-time - augs (multiscale, flip, etc.) and the inner list indicates - images in a batch. - """ - for var, name in [(points, 'points'), (img_metas, 'img_metas')]: - if not isinstance(var, list): - raise TypeError(f'{name} must be a list, but got {type(var)}') - - num_augs = len(points) - if num_augs != len(img_metas): - raise ValueError(f'num of augmentations ({len(points)}) != ' - f'num of image meta ({len(img_metas)})') - - if num_augs == 1: - return self.simple_test(points[0], img_metas[0], **kwargs) - else: - return self.aug_test(points, img_metas, **kwargs) - - @auto_fp16(apply_to=('points')) - def forward(self, return_loss=True, **kwargs): - """Calls either forward_train or forward_test depending on whether - return_loss=True. - - Note this setting will change the expected inputs. When - `return_loss=True`, point and img_metas are single-nested (i.e. - torch.Tensor and list[dict]), and when `resturn_loss=False`, point and - img_metas should be double nested (i.e. list[torch.Tensor], - list[list[dict]]), with the outer list indicating test time - augmentations. - """ - if return_loss: - return self.forward_train(**kwargs) - else: - return self.forward_test(**kwargs) - - def show_results(self, - data, - result, - palette=None, - out_dir=None, - ignore_index=None, - show=False, - score_thr=None): - """Results visualization. - - Args: - data (list[dict]): Input points and the information of the sample. - result (list[dict]): Prediction results. - palette (list[list[int]]] | np.ndarray): The palette of - segmentation map. If None is given, random palette will be - generated. Default: None - out_dir (str): Output directory of visualization result. - ignore_index (int, optional): The label index to be ignored, e.g. - unannotated points. If None is given, set to len(self.CLASSES). - Defaults to None. - show (bool, optional): Determines whether you are - going to show result by open3d. - Defaults to False. - TODO: implement score_thr of Base3DSegmentor. - score_thr (float, optional): Score threshold of bounding boxes. - Default to None. - Not implemented yet, but it is here for unification. - """ - assert out_dir is not None, 'Expect out_dir, got none.' - if palette is None: - if self.PALETTE is None: - palette = np.random.randint( - 0, 255, size=(len(self.CLASSES), 3)) - else: - palette = self.PALETTE - palette = np.array(palette) - for batch_id in range(len(result)): - if isinstance(data['points'][0], DC): - points = data['points'][0]._data[0][batch_id].numpy() - elif mmcv.is_list_of(data['points'][0], torch.Tensor): - points = data['points'][0][batch_id] - else: - ValueError(f"Unsupported data type {type(data['points'][0])} " - f'for visualization!') - if isinstance(data['img_metas'][0], DC): - pts_filename = data['img_metas'][0]._data[0][batch_id][ - 'pts_filename'] - elif mmcv.is_list_of(data['img_metas'][0], dict): - pts_filename = data['img_metas'][0][batch_id]['pts_filename'] - else: - ValueError( - f"Unsupported data type {type(data['img_metas'][0])} " - f'for visualization!') - file_name = osp.split(pts_filename)[-1].split('.')[0] - - pred_sem_mask = result[batch_id]['semantic_mask'].cpu().numpy() - - show_seg_result( - points, - None, - pred_sem_mask, - out_dir, - file_name, - palette, - ignore_index, - show=show) +# Copyright (c) OpenMMLab. All rights reserved. 
+from os import path as osp + +import mmcv +import numpy as np +import torch +from mmcv.parallel import DataContainer as DC +from mmcv.runner import auto_fp16 + +from mmdet3d.core import show_seg_result +from mmseg.models.segmentors import BaseSegmentor + + +class Base3DSegmentor(BaseSegmentor): + """Base class for 3D segmentors. + + The main difference with `BaseSegmentor` is that we modify the keys in + data_dict and use a 3D seg specific visualization function. + """ + + @property + def with_regularization_loss(self): + """bool: whether the segmentor has regularization loss for weight""" + return hasattr(self, 'loss_regularization') and \ + self.loss_regularization is not None + + def forward_test(self, points, img_metas, **kwargs): + """Calls either simple_test or aug_test depending on the length of + outer list of points. If len(points) == 1, call simple_test. Otherwise + call aug_test to aggregate the test results by e.g. voting. + + Args: + points (list[list[torch.Tensor]]): the outer list indicates + test-time augmentations and inner torch.Tensor should have a + shape BXNxC, which contains all points in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(points, 'points'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got {type(var)}') + + num_augs = len(points) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(points)}) != ' + f'num of image meta ({len(img_metas)})') + + if num_augs == 1: + return self.simple_test(points[0], img_metas[0], **kwargs) + else: + return self.aug_test(points, img_metas, **kwargs) + + @auto_fp16(apply_to=('points')) + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note this setting will change the expected inputs. When + `return_loss=True`, point and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, point and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def show_results(self, + data, + result, + palette=None, + out_dir=None, + ignore_index=None, + show=False, + score_thr=None): + """Results visualization. + + Args: + data (list[dict]): Input points and the information of the sample. + result (list[dict]): Prediction results. + palette (list[list[int]]] | np.ndarray): The palette of + segmentation map. If None is given, random palette will be + generated. Default: None + out_dir (str): Output directory of visualization result. + ignore_index (int, optional): The label index to be ignored, e.g. + unannotated points. If None is given, set to len(self.CLASSES). + Defaults to None. + show (bool, optional): Determines whether you are + going to show result by open3d. + Defaults to False. + TODO: implement score_thr of Base3DSegmentor. + score_thr (float, optional): Score threshold of bounding boxes. + Default to None. + Not implemented yet, but it is here for unification. + """ + assert out_dir is not None, 'Expect out_dir, got none.' 
+ if palette is None: + if self.PALETTE is None: + palette = np.random.randint( + 0, 255, size=(len(self.CLASSES), 3)) + else: + palette = self.PALETTE + palette = np.array(palette) + for batch_id in range(len(result)): + if isinstance(data['points'][0], DC): + points = data['points'][0]._data[0][batch_id].numpy() + elif mmcv.is_list_of(data['points'][0], torch.Tensor): + points = data['points'][0][batch_id] + else: + ValueError(f"Unsupported data type {type(data['points'][0])} " + f'for visualization!') + if isinstance(data['img_metas'][0], DC): + pts_filename = data['img_metas'][0]._data[0][batch_id][ + 'pts_filename'] + elif mmcv.is_list_of(data['img_metas'][0], dict): + pts_filename = data['img_metas'][0][batch_id]['pts_filename'] + else: + ValueError( + f"Unsupported data type {type(data['img_metas'][0])} " + f'for visualization!') + file_name = osp.split(pts_filename)[-1].split('.')[0] + + pred_sem_mask = result[batch_id]['semantic_mask'].cpu().numpy() + + show_seg_result( + points, + None, + pred_sem_mask, + out_dir, + file_name, + palette, + ignore_index, + show=show) diff --git a/mmdet3d/models/segmentors/encoder_decoder.py b/mmdet3d/models/segmentors/encoder_decoder.py index 1a4fee9..794633e 100644 --- a/mmdet3d/models/segmentors/encoder_decoder.py +++ b/mmdet3d/models/segmentors/encoder_decoder.py @@ -1,454 +1,454 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch -from torch import nn as nn -from torch.nn import functional as F - -from mmseg.core import add_prefix -from ..builder import (SEGMENTORS, build_backbone, build_head, build_loss, - build_neck) -from .base import Base3DSegmentor - - -@SEGMENTORS.register_module() -class EncoderDecoder3D(Base3DSegmentor): - """3D Encoder Decoder segmentors. - - EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. - Note that auxiliary_head is only used for deep supervision during training, - which could be thrown during inference. 
- """ - - def __init__(self, - backbone, - decode_head, - neck=None, - auxiliary_head=None, - loss_regularization=None, - train_cfg=None, - test_cfg=None, - pretrained=None, - init_cfg=None): - super(EncoderDecoder3D, self).__init__(init_cfg=init_cfg) - self.backbone = build_backbone(backbone) - if neck is not None: - self.neck = build_neck(neck) - self._init_decode_head(decode_head) - self._init_auxiliary_head(auxiliary_head) - self._init_loss_regularization(loss_regularization) - - self.train_cfg = train_cfg - self.test_cfg = test_cfg - assert self.with_decode_head, \ - '3D EncoderDecoder Segmentor should have a decode_head' - - def _init_decode_head(self, decode_head): - """Initialize ``decode_head``""" - self.decode_head = build_head(decode_head) - self.num_classes = self.decode_head.num_classes - - def _init_auxiliary_head(self, auxiliary_head): - """Initialize ``auxiliary_head``""" - if auxiliary_head is not None: - if isinstance(auxiliary_head, list): - self.auxiliary_head = nn.ModuleList() - for head_cfg in auxiliary_head: - self.auxiliary_head.append(build_head(head_cfg)) - else: - self.auxiliary_head = build_head(auxiliary_head) - - def _init_loss_regularization(self, loss_regularization): - """Initialize ``loss_regularization``""" - if loss_regularization is not None: - if isinstance(loss_regularization, list): - self.loss_regularization = nn.ModuleList() - for loss_cfg in loss_regularization: - self.loss_regularization.append(build_loss(loss_cfg)) - else: - self.loss_regularization = build_loss(loss_regularization) - - def extract_feat(self, points): - """Extract features from points.""" - x = self.backbone(points) - if self.with_neck: - x = self.neck(x) - return x - - def encode_decode(self, points, img_metas): - """Encode points with backbone and decode into a semantic segmentation - map of the same size as input. - - Args: - points (torch.Tensor): Input points of shape [B, N, 3+C]. - img_metas (list[dict]): Meta information of each sample. - - Returns: - torch.Tensor: Segmentation logits of shape [B, num_classes, N]. 
- """ - x = self.extract_feat(points) - out = self._decode_head_forward_test(x, img_metas) - return out - - def _decode_head_forward_train(self, x, img_metas, pts_semantic_mask): - """Run forward function and calculate loss for decode head in - training.""" - losses = dict() - loss_decode = self.decode_head.forward_train(x, img_metas, - pts_semantic_mask, - self.train_cfg) - - losses.update(add_prefix(loss_decode, 'decode')) - return losses - - def _decode_head_forward_test(self, x, img_metas): - """Run forward function and calculate loss for decode head in - inference.""" - seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) - return seg_logits - - def _auxiliary_head_forward_train(self, x, img_metas, pts_semantic_mask): - """Run forward function and calculate loss for auxiliary head in - training.""" - losses = dict() - if isinstance(self.auxiliary_head, nn.ModuleList): - for idx, aux_head in enumerate(self.auxiliary_head): - loss_aux = aux_head.forward_train(x, img_metas, - pts_semantic_mask, - self.train_cfg) - losses.update(add_prefix(loss_aux, f'aux_{idx}')) - else: - loss_aux = self.auxiliary_head.forward_train( - x, img_metas, pts_semantic_mask, self.train_cfg) - losses.update(add_prefix(loss_aux, 'aux')) - - return losses - - def _loss_regularization_forward_train(self): - """Calculate regularization loss for model weight in training.""" - losses = dict() - if isinstance(self.loss_regularization, nn.ModuleList): - for idx, regularize_loss in enumerate(self.loss_regularization): - loss_regularize = dict( - loss_regularize=regularize_loss(self.modules())) - losses.update(add_prefix(loss_regularize, f'regularize_{idx}')) - else: - loss_regularize = dict( - loss_regularize=self.loss_regularization(self.modules())) - losses.update(add_prefix(loss_regularize, 'regularize')) - - return losses - - def forward_dummy(self, points): - """Dummy forward function.""" - seg_logit = self.encode_decode(points, None) - - return seg_logit - - def forward_train(self, points, img_metas, pts_semantic_mask): - """Forward function for training. - - Args: - points (list[torch.Tensor]): List of points of shape [N, C]. - img_metas (list): Image metas. - pts_semantic_mask (list[torch.Tensor]): List of point-wise semantic - labels of shape [N]. - - Returns: - dict[str, Tensor]: Losses. - """ - points_cat = torch.stack(points) - pts_semantic_mask_cat = torch.stack(pts_semantic_mask) - - # extract features using backbone - x = self.extract_feat(points_cat) - - losses = dict() - - loss_decode = self._decode_head_forward_train(x, img_metas, - pts_semantic_mask_cat) - losses.update(loss_decode) - - if self.with_auxiliary_head: - loss_aux = self._auxiliary_head_forward_train( - x, img_metas, pts_semantic_mask_cat) - losses.update(loss_aux) - - if self.with_regularization_loss: - loss_regularize = self._loss_regularization_forward_train() - losses.update(loss_regularize) - - return losses - - @staticmethod - def _input_generation(coords, - patch_center, - coord_max, - feats, - use_normalized_coord=False): - """Generating model input. - - Generate input by subtracting patch center and adding additional - features. Currently support colors and normalized xyz as features. - - Args: - coords (torch.Tensor): Sampled 3D point coordinate of shape [S, 3]. - patch_center (torch.Tensor): Center coordinate of the patch. - coord_max (torch.Tensor): Max coordinate of all 3D points. - feats (torch.Tensor): Features of sampled points of shape [S, C]. 
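# The add_prefix() calls above are what keep decode, auxiliary and regularization
# losses from overwriting each other in the returned dict. A tiny example of the
# resulting key layout (the loss names themselves are illustrative):
from mmseg.core import add_prefix

loss_decode = dict(loss_sem_seg=0.42, acc_seg=81.3)
print(add_prefix(loss_decode, 'decode'))
# {'decode.loss_sem_seg': 0.42, 'decode.acc_seg': 81.3}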
- use_normalized_coord (bool, optional): Whether to use normalized - xyz as additional features. Defaults to False. - - Returns: - torch.Tensor: The generated input data of shape [S, 3+C']. - """ - # subtract patch center, the z dimension is not centered - centered_coords = coords.clone() - centered_coords[:, 0] -= patch_center[0] - centered_coords[:, 1] -= patch_center[1] - - # normalized coordinates as extra features - if use_normalized_coord: - normalized_coord = coords / coord_max - feats = torch.cat([feats, normalized_coord], dim=1) - - points = torch.cat([centered_coords, feats], dim=1) - - return points - - def _sliding_patch_generation(self, - points, - num_points, - block_size, - sample_rate=0.5, - use_normalized_coord=False, - eps=1e-3): - """Sampling points in a sliding window fashion. - - First sample patches to cover all the input points. - Then sample points in each patch to batch points of a certain number. - - Args: - points (torch.Tensor): Input points of shape [N, 3+C]. - num_points (int): Number of points to be sampled in each patch. - block_size (float, optional): Size of a patch to sample. - sample_rate (float, optional): Stride used in sliding patch. - Defaults to 0.5. - use_normalized_coord (bool, optional): Whether to use normalized - xyz as additional features. Defaults to False. - eps (float, optional): A value added to patch boundary to guarantee - points coverage. Defaults to 1e-3. - - Returns: - np.ndarray | np.ndarray: - - - patch_points (torch.Tensor): Points of different patches of - shape [K, N, 3+C]. - - patch_idxs (torch.Tensor): Index of each point in - `patch_points`, of shape [K, N]. - """ - device = points.device - # we assume the first three dims are points' 3D coordinates - # and the rest dims are their per-point features - coords = points[:, :3] - feats = points[:, 3:] - - coord_max = coords.max(0)[0] - coord_min = coords.min(0)[0] - stride = block_size * sample_rate - num_grid_x = int( - torch.ceil((coord_max[0] - coord_min[0] - block_size) / - stride).item() + 1) - num_grid_y = int( - torch.ceil((coord_max[1] - coord_min[1] - block_size) / - stride).item() + 1) - - patch_points, patch_idxs = [], [] - for idx_y in range(num_grid_y): - s_y = coord_min[1] + idx_y * stride - e_y = torch.min(s_y + block_size, coord_max[1]) - s_y = e_y - block_size - for idx_x in range(num_grid_x): - s_x = coord_min[0] + idx_x * stride - e_x = torch.min(s_x + block_size, coord_max[0]) - s_x = e_x - block_size - - # extract points within this patch - cur_min = torch.tensor([s_x, s_y, coord_min[2]]).to(device) - cur_max = torch.tensor([e_x, e_y, coord_max[2]]).to(device) - cur_choice = ((coords >= cur_min - eps) & - (coords <= cur_max + eps)).all(dim=1) - - if not cur_choice.any(): # no points in this patch - continue - - # sample points in this patch to multiple batches - cur_center = cur_min + block_size / 2.0 - point_idxs = torch.nonzero(cur_choice, as_tuple=True)[0] - num_batch = int(np.ceil(point_idxs.shape[0] / num_points)) - point_size = int(num_batch * num_points) - replace = point_size > 2 * point_idxs.shape[0] - num_repeat = point_size - point_idxs.shape[0] - if replace: # duplicate - point_idxs_repeat = point_idxs[torch.randint( - 0, point_idxs.shape[0], - size=(num_repeat, )).to(device)] - else: - point_idxs_repeat = point_idxs[torch.randperm( - point_idxs.shape[0])[:num_repeat]] - - choices = torch.cat([point_idxs, point_idxs_repeat], dim=0) - choices = choices[torch.randperm(choices.shape[0])] - - # construct model input - point_batches = 
self._input_generation( - coords[choices], - cur_center, - coord_max, - feats[choices], - use_normalized_coord=use_normalized_coord) - - patch_points.append(point_batches) - patch_idxs.append(choices) - - patch_points = torch.cat(patch_points, dim=0) - patch_idxs = torch.cat(patch_idxs, dim=0) - - # make sure all points are sampled at least once - assert torch.unique(patch_idxs).shape[0] == points.shape[0], \ - 'some points are not sampled in sliding inference' - - return patch_points, patch_idxs - - def slide_inference(self, point, img_meta, rescale): - """Inference by sliding-window with overlap. - - Args: - point (torch.Tensor): Input points of shape [N, 3+C]. - img_meta (dict): Meta information of input sample. - rescale (bool): Whether transform to original number of points. - Will be used for voxelization based segmentors. - - Returns: - Tensor: The output segmentation map of shape [num_classes, N]. - """ - num_points = self.test_cfg.num_points - block_size = self.test_cfg.block_size - sample_rate = self.test_cfg.sample_rate - use_normalized_coord = self.test_cfg.use_normalized_coord - batch_size = self.test_cfg.batch_size * num_points - - # patch_points is of shape [K*N, 3+C], patch_idxs is of shape [K*N] - patch_points, patch_idxs = self._sliding_patch_generation( - point, num_points, block_size, sample_rate, use_normalized_coord) - feats_dim = patch_points.shape[1] - seg_logits = [] # save patch predictions - - for batch_idx in range(0, patch_points.shape[0], batch_size): - batch_points = patch_points[batch_idx:batch_idx + batch_size] - batch_points = batch_points.view(-1, num_points, feats_dim) - # batch_seg_logit is of shape [B, num_classes, N] - batch_seg_logit = self.encode_decode(batch_points, img_meta) - batch_seg_logit = batch_seg_logit.transpose(1, 2).contiguous() - seg_logits.append(batch_seg_logit.view(-1, self.num_classes)) - - # aggregate per-point logits by indexing sum and dividing count - seg_logits = torch.cat(seg_logits, dim=0) # [K*N, num_classes] - expand_patch_idxs = patch_idxs.unsqueeze(1).repeat(1, self.num_classes) - preds = point.new_zeros((point.shape[0], self.num_classes)).\ - scatter_add_(dim=0, index=expand_patch_idxs, src=seg_logits) - count_mat = torch.bincount(patch_idxs) - preds = preds / count_mat[:, None] - - # TODO: if rescale and voxelization segmentor - - return preds.transpose(0, 1) # to [num_classes, K*N] - - def whole_inference(self, points, img_metas, rescale): - """Inference with full scene (one forward pass without sliding).""" - seg_logit = self.encode_decode(points, img_metas) - # TODO: if rescale and voxelization segmentor - return seg_logit - - def inference(self, points, img_metas, rescale): - """Inference with slide/whole style. - - Args: - points (torch.Tensor): Input points of shape [B, N, 3+C]. - img_metas (list[dict]): Meta information of each sample. - rescale (bool): Whether transform to original number of points. - Will be used for voxelization based segmentors. - - Returns: - Tensor: The output segmentation map. - """ - assert self.test_cfg.mode in ['slide', 'whole'] - if self.test_cfg.mode == 'slide': - seg_logit = torch.stack([ - self.slide_inference(point, img_meta, rescale) - for point, img_meta in zip(points, img_metas) - ], 0) - else: - seg_logit = self.whole_inference(points, img_metas, rescale) - output = F.softmax(seg_logit, dim=1) - return output - - def simple_test(self, points, img_metas, rescale=True): - """Simple test with single scene. - - Args: - points (list[torch.Tensor]): List of points of shape [N, 3+C]. 
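# The overlap handling at the end of slide_inference() above boils down to a
# scatter-add followed by division by the per-point visit count. A micro example
# with made-up logits for three points and two classes:
import torch

num_classes = 2
patch_idxs = torch.tensor([0, 1, 1, 2])    # point 1 was sampled by two patches
seg_logits = torch.tensor([[1., 0.], [2., 0.], [4., 2.], [0., 3.]])
expand_idxs = patch_idxs.unsqueeze(1).repeat(1, num_classes)
preds = torch.zeros(3, num_classes).scatter_add_(0, expand_idxs, seg_logits)
preds = preds / torch.bincount(patch_idxs)[:, None]
print(preds)   # tensor([[1., 0.], [3., 1.], [0., 3.]])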
- img_metas (list[dict]): Meta information of each sample. - rescale (bool): Whether transform to original number of points. - Will be used for voxelization based segmentors. - Defaults to True. - - Returns: - list[dict]: The output prediction result with following keys: - - - semantic_mask (Tensor): Segmentation mask of shape [N]. - """ - # 3D segmentation requires per-point prediction, so it's impossible - # to use down-sampling to get a batch of scenes with same num_points - # therefore, we only support testing one scene every time - seg_pred = [] - for point, img_meta in zip(points, img_metas): - seg_prob = self.inference(point.unsqueeze(0), [img_meta], - rescale)[0] - seg_map = seg_prob.argmax(0) # [N] - # to cpu tensor for consistency with det3d - seg_map = seg_map.cpu() - seg_pred.append(seg_map) - # warp in dict - seg_pred = [dict(semantic_mask=seg_map) for seg_map in seg_pred] - return seg_pred - - def aug_test(self, points, img_metas, rescale=True): - """Test with augmentations. - - Args: - points (list[torch.Tensor]): List of points of shape [B, N, 3+C]. - img_metas (list[list[dict]]): Meta information of each sample. - Outer list are different samples while inner is different augs. - rescale (bool): Whether transform to original number of points. - Will be used for voxelization based segmentors. - Defaults to True. - - Returns: - list[dict]: The output prediction result with following keys: - - - semantic_mask (Tensor): Segmentation mask of shape [N]. - """ - # in aug_test, one scene going through different augmentations could - # have the same number of points and are stacked as a batch - # to save memory, we get augmented seg logit inplace - seg_pred = [] - for point, img_meta in zip(points, img_metas): - seg_prob = self.inference(point, img_meta, rescale) - seg_prob = seg_prob.mean(0) # [num_classes, N] - seg_map = seg_prob.argmax(0) # [N] - # to cpu tensor for consistency with det3d - seg_map = seg_map.cpu() - seg_pred.append(seg_map) - # warp in dict - seg_pred = [dict(semantic_mask=seg_map) for seg_map in seg_pred] - return seg_pred +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from torch import nn as nn +from torch.nn import functional as F + +from mmseg.core import add_prefix +from ..builder import (SEGMENTORS, build_backbone, build_head, build_loss, + build_neck) +from .base import Base3DSegmentor + + +@SEGMENTORS.register_module() +class EncoderDecoder3D(Base3DSegmentor): + """3D Encoder Decoder segmentors. + + EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. + Note that auxiliary_head is only used for deep supervision during training, + which could be thrown during inference. 
+ """ + + def __init__(self, + backbone, + decode_head, + neck=None, + auxiliary_head=None, + loss_regularization=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(EncoderDecoder3D, self).__init__(init_cfg=init_cfg) + self.backbone = build_backbone(backbone) + if neck is not None: + self.neck = build_neck(neck) + self._init_decode_head(decode_head) + self._init_auxiliary_head(auxiliary_head) + self._init_loss_regularization(loss_regularization) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + assert self.with_decode_head, \ + '3D EncoderDecoder Segmentor should have a decode_head' + + def _init_decode_head(self, decode_head): + """Initialize ``decode_head``""" + self.decode_head = build_head(decode_head) + self.num_classes = self.decode_head.num_classes + + def _init_auxiliary_head(self, auxiliary_head): + """Initialize ``auxiliary_head``""" + if auxiliary_head is not None: + if isinstance(auxiliary_head, list): + self.auxiliary_head = nn.ModuleList() + for head_cfg in auxiliary_head: + self.auxiliary_head.append(build_head(head_cfg)) + else: + self.auxiliary_head = build_head(auxiliary_head) + + def _init_loss_regularization(self, loss_regularization): + """Initialize ``loss_regularization``""" + if loss_regularization is not None: + if isinstance(loss_regularization, list): + self.loss_regularization = nn.ModuleList() + for loss_cfg in loss_regularization: + self.loss_regularization.append(build_loss(loss_cfg)) + else: + self.loss_regularization = build_loss(loss_regularization) + + def extract_feat(self, points): + """Extract features from points.""" + x = self.backbone(points) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, points, img_metas): + """Encode points with backbone and decode into a semantic segmentation + map of the same size as input. + + Args: + points (torch.Tensor): Input points of shape [B, N, 3+C]. + img_metas (list[dict]): Meta information of each sample. + + Returns: + torch.Tensor: Segmentation logits of shape [B, num_classes, N]. 
+ """ + x = self.extract_feat(points) + out = self._decode_head_forward_test(x, img_metas) + return out + + def _decode_head_forward_train(self, x, img_metas, pts_semantic_mask): + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.forward_train(x, img_metas, + pts_semantic_mask, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def _decode_head_forward_test(self, x, img_metas): + """Run forward function and calculate loss for decode head in + inference.""" + seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) + return seg_logits + + def _auxiliary_head_forward_train(self, x, img_metas, pts_semantic_mask): + """Run forward function and calculate loss for auxiliary head in + training.""" + losses = dict() + if isinstance(self.auxiliary_head, nn.ModuleList): + for idx, aux_head in enumerate(self.auxiliary_head): + loss_aux = aux_head.forward_train(x, img_metas, + pts_semantic_mask, + self.train_cfg) + losses.update(add_prefix(loss_aux, f'aux_{idx}')) + else: + loss_aux = self.auxiliary_head.forward_train( + x, img_metas, pts_semantic_mask, self.train_cfg) + losses.update(add_prefix(loss_aux, 'aux')) + + return losses + + def _loss_regularization_forward_train(self): + """Calculate regularization loss for model weight in training.""" + losses = dict() + if isinstance(self.loss_regularization, nn.ModuleList): + for idx, regularize_loss in enumerate(self.loss_regularization): + loss_regularize = dict( + loss_regularize=regularize_loss(self.modules())) + losses.update(add_prefix(loss_regularize, f'regularize_{idx}')) + else: + loss_regularize = dict( + loss_regularize=self.loss_regularization(self.modules())) + losses.update(add_prefix(loss_regularize, 'regularize')) + + return losses + + def forward_dummy(self, points): + """Dummy forward function.""" + seg_logit = self.encode_decode(points, None) + + return seg_logit + + def forward_train(self, points, img_metas, pts_semantic_mask): + """Forward function for training. + + Args: + points (list[torch.Tensor]): List of points of shape [N, C]. + img_metas (list): Image metas. + pts_semantic_mask (list[torch.Tensor]): List of point-wise semantic + labels of shape [N]. + + Returns: + dict[str, Tensor]: Losses. + """ + points_cat = torch.stack(points) + pts_semantic_mask_cat = torch.stack(pts_semantic_mask) + + # extract features using backbone + x = self.extract_feat(points_cat) + + losses = dict() + + loss_decode = self._decode_head_forward_train(x, img_metas, + pts_semantic_mask_cat) + losses.update(loss_decode) + + if self.with_auxiliary_head: + loss_aux = self._auxiliary_head_forward_train( + x, img_metas, pts_semantic_mask_cat) + losses.update(loss_aux) + + if self.with_regularization_loss: + loss_regularize = self._loss_regularization_forward_train() + losses.update(loss_regularize) + + return losses + + @staticmethod + def _input_generation(coords, + patch_center, + coord_max, + feats, + use_normalized_coord=False): + """Generating model input. + + Generate input by subtracting patch center and adding additional + features. Currently support colors and normalized xyz as features. + + Args: + coords (torch.Tensor): Sampled 3D point coordinate of shape [S, 3]. + patch_center (torch.Tensor): Center coordinate of the patch. + coord_max (torch.Tensor): Max coordinate of all 3D points. + feats (torch.Tensor): Features of sampled points of shape [S, C]. 
+ use_normalized_coord (bool, optional): Whether to use normalized + xyz as additional features. Defaults to False. + + Returns: + torch.Tensor: The generated input data of shape [S, 3+C']. + """ + # subtract patch center, the z dimension is not centered + centered_coords = coords.clone() + centered_coords[:, 0] -= patch_center[0] + centered_coords[:, 1] -= patch_center[1] + + # normalized coordinates as extra features + if use_normalized_coord: + normalized_coord = coords / coord_max + feats = torch.cat([feats, normalized_coord], dim=1) + + points = torch.cat([centered_coords, feats], dim=1) + + return points + + def _sliding_patch_generation(self, + points, + num_points, + block_size, + sample_rate=0.5, + use_normalized_coord=False, + eps=1e-3): + """Sampling points in a sliding window fashion. + + First sample patches to cover all the input points. + Then sample points in each patch to batch points of a certain number. + + Args: + points (torch.Tensor): Input points of shape [N, 3+C]. + num_points (int): Number of points to be sampled in each patch. + block_size (float, optional): Size of a patch to sample. + sample_rate (float, optional): Stride used in sliding patch. + Defaults to 0.5. + use_normalized_coord (bool, optional): Whether to use normalized + xyz as additional features. Defaults to False. + eps (float, optional): A value added to patch boundary to guarantee + points coverage. Defaults to 1e-3. + + Returns: + np.ndarray | np.ndarray: + + - patch_points (torch.Tensor): Points of different patches of + shape [K, N, 3+C]. + - patch_idxs (torch.Tensor): Index of each point in + `patch_points`, of shape [K, N]. + """ + device = points.device + # we assume the first three dims are points' 3D coordinates + # and the rest dims are their per-point features + coords = points[:, :3] + feats = points[:, 3:] + + coord_max = coords.max(0)[0] + coord_min = coords.min(0)[0] + stride = block_size * sample_rate + num_grid_x = int( + torch.ceil((coord_max[0] - coord_min[0] - block_size) / + stride).item() + 1) + num_grid_y = int( + torch.ceil((coord_max[1] - coord_min[1] - block_size) / + stride).item() + 1) + + patch_points, patch_idxs = [], [] + for idx_y in range(num_grid_y): + s_y = coord_min[1] + idx_y * stride + e_y = torch.min(s_y + block_size, coord_max[1]) + s_y = e_y - block_size + for idx_x in range(num_grid_x): + s_x = coord_min[0] + idx_x * stride + e_x = torch.min(s_x + block_size, coord_max[0]) + s_x = e_x - block_size + + # extract points within this patch + cur_min = torch.tensor([s_x, s_y, coord_min[2]]).to(device) + cur_max = torch.tensor([e_x, e_y, coord_max[2]]).to(device) + cur_choice = ((coords >= cur_min - eps) & + (coords <= cur_max + eps)).all(dim=1) + + if not cur_choice.any(): # no points in this patch + continue + + # sample points in this patch to multiple batches + cur_center = cur_min + block_size / 2.0 + point_idxs = torch.nonzero(cur_choice, as_tuple=True)[0] + num_batch = int(np.ceil(point_idxs.shape[0] / num_points)) + point_size = int(num_batch * num_points) + replace = point_size > 2 * point_idxs.shape[0] + num_repeat = point_size - point_idxs.shape[0] + if replace: # duplicate + point_idxs_repeat = point_idxs[torch.randint( + 0, point_idxs.shape[0], + size=(num_repeat, )).to(device)] + else: + point_idxs_repeat = point_idxs[torch.randperm( + point_idxs.shape[0])[:num_repeat]] + + choices = torch.cat([point_idxs, point_idxs_repeat], dim=0) + choices = choices[torch.randperm(choices.shape[0])] + + # construct model input + point_batches = 
self._input_generation( + coords[choices], + cur_center, + coord_max, + feats[choices], + use_normalized_coord=use_normalized_coord) + + patch_points.append(point_batches) + patch_idxs.append(choices) + + patch_points = torch.cat(patch_points, dim=0) + patch_idxs = torch.cat(patch_idxs, dim=0) + + # make sure all points are sampled at least once + assert torch.unique(patch_idxs).shape[0] == points.shape[0], \ + 'some points are not sampled in sliding inference' + + return patch_points, patch_idxs + + def slide_inference(self, point, img_meta, rescale): + """Inference by sliding-window with overlap. + + Args: + point (torch.Tensor): Input points of shape [N, 3+C]. + img_meta (dict): Meta information of input sample. + rescale (bool): Whether transform to original number of points. + Will be used for voxelization based segmentors. + + Returns: + Tensor: The output segmentation map of shape [num_classes, N]. + """ + num_points = self.test_cfg.num_points + block_size = self.test_cfg.block_size + sample_rate = self.test_cfg.sample_rate + use_normalized_coord = self.test_cfg.use_normalized_coord + batch_size = self.test_cfg.batch_size * num_points + + # patch_points is of shape [K*N, 3+C], patch_idxs is of shape [K*N] + patch_points, patch_idxs = self._sliding_patch_generation( + point, num_points, block_size, sample_rate, use_normalized_coord) + feats_dim = patch_points.shape[1] + seg_logits = [] # save patch predictions + + for batch_idx in range(0, patch_points.shape[0], batch_size): + batch_points = patch_points[batch_idx:batch_idx + batch_size] + batch_points = batch_points.view(-1, num_points, feats_dim) + # batch_seg_logit is of shape [B, num_classes, N] + batch_seg_logit = self.encode_decode(batch_points, img_meta) + batch_seg_logit = batch_seg_logit.transpose(1, 2).contiguous() + seg_logits.append(batch_seg_logit.view(-1, self.num_classes)) + + # aggregate per-point logits by indexing sum and dividing count + seg_logits = torch.cat(seg_logits, dim=0) # [K*N, num_classes] + expand_patch_idxs = patch_idxs.unsqueeze(1).repeat(1, self.num_classes) + preds = point.new_zeros((point.shape[0], self.num_classes)).\ + scatter_add_(dim=0, index=expand_patch_idxs, src=seg_logits) + count_mat = torch.bincount(patch_idxs) + preds = preds / count_mat[:, None] + + # TODO: if rescale and voxelization segmentor + + return preds.transpose(0, 1) # to [num_classes, K*N] + + def whole_inference(self, points, img_metas, rescale): + """Inference with full scene (one forward pass without sliding).""" + seg_logit = self.encode_decode(points, img_metas) + # TODO: if rescale and voxelization segmentor + return seg_logit + + def inference(self, points, img_metas, rescale): + """Inference with slide/whole style. + + Args: + points (torch.Tensor): Input points of shape [B, N, 3+C]. + img_metas (list[dict]): Meta information of each sample. + rescale (bool): Whether transform to original number of points. + Will be used for voxelization based segmentors. + + Returns: + Tensor: The output segmentation map. + """ + assert self.test_cfg.mode in ['slide', 'whole'] + if self.test_cfg.mode == 'slide': + seg_logit = torch.stack([ + self.slide_inference(point, img_meta, rescale) + for point, img_meta in zip(points, img_metas) + ], 0) + else: + seg_logit = self.whole_inference(points, img_metas, rescale) + output = F.softmax(seg_logit, dim=1) + return output + + def simple_test(self, points, img_metas, rescale=True): + """Simple test with single scene. + + Args: + points (list[torch.Tensor]): List of points of shape [N, 3+C]. 
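# slide_inference()/inference() above read the following attributes from
# self.test_cfg. The values here only illustrate the expected structure of that
# config; they are not defaults taken from this repository.
from mmcv import Config

test_cfg = Config(dict(
    mode='slide',                # 'slide' -> slide_inference(), 'whole' -> whole_inference()
    num_points=4096,             # points per patch fed to encode_decode()
    block_size=1.5,              # patch edge length
    sample_rate=0.5,             # sliding stride = block_size * sample_rate
    use_normalized_coord=False,  # append xyz / coord_max as extra point features
    batch_size=8))               # patches per forward pass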
+ img_metas (list[dict]): Meta information of each sample. + rescale (bool): Whether transform to original number of points. + Will be used for voxelization based segmentors. + Defaults to True. + + Returns: + list[dict]: The output prediction result with following keys: + + - semantic_mask (Tensor): Segmentation mask of shape [N]. + """ + # 3D segmentation requires per-point prediction, so it's impossible + # to use down-sampling to get a batch of scenes with same num_points + # therefore, we only support testing one scene every time + seg_pred = [] + for point, img_meta in zip(points, img_metas): + seg_prob = self.inference(point.unsqueeze(0), [img_meta], + rescale)[0] + seg_map = seg_prob.argmax(0) # [N] + # to cpu tensor for consistency with det3d + seg_map = seg_map.cpu() + seg_pred.append(seg_map) + # warp in dict + seg_pred = [dict(semantic_mask=seg_map) for seg_map in seg_pred] + return seg_pred + + def aug_test(self, points, img_metas, rescale=True): + """Test with augmentations. + + Args: + points (list[torch.Tensor]): List of points of shape [B, N, 3+C]. + img_metas (list[list[dict]]): Meta information of each sample. + Outer list are different samples while inner is different augs. + rescale (bool): Whether transform to original number of points. + Will be used for voxelization based segmentors. + Defaults to True. + + Returns: + list[dict]: The output prediction result with following keys: + + - semantic_mask (Tensor): Segmentation mask of shape [N]. + """ + # in aug_test, one scene going through different augmentations could + # have the same number of points and are stacked as a batch + # to save memory, we get augmented seg logit inplace + seg_pred = [] + for point, img_meta in zip(points, img_metas): + seg_prob = self.inference(point, img_meta, rescale) + seg_prob = seg_prob.mean(0) # [num_classes, N] + seg_map = seg_prob.argmax(0) # [N] + # to cpu tensor for consistency with det3d + seg_map = seg_map.cpu() + seg_pred.append(seg_map) + # warp in dict + seg_pred = [dict(semantic_mask=seg_map) for seg_map in seg_pred] + return seg_pred diff --git a/mmdet3d/models/utils/__init__.py b/mmdet3d/models/utils/__init__.py index 92a0499..41451f1 100644 --- a/mmdet3d/models/utils/__init__.py +++ b/mmdet3d/models/utils/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .clip_sigmoid import clip_sigmoid -from .edge_indices import get_edge_indices -from .gen_keypoints import get_keypoints -from .handle_objs import filter_outside_objs, handle_proj_objs -from .mlp import MLP - -__all__ = [ - 'clip_sigmoid', 'MLP', 'get_edge_indices', 'filter_outside_objs', - 'handle_proj_objs', 'get_keypoints' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .clip_sigmoid import clip_sigmoid +from .edge_indices import get_edge_indices +from .gen_keypoints import get_keypoints +from .handle_objs import filter_outside_objs, handle_proj_objs +from .mlp import MLP + +__all__ = [ + 'clip_sigmoid', 'MLP', 'get_edge_indices', 'filter_outside_objs', + 'handle_proj_objs', 'get_keypoints' +] diff --git a/mmdet3d/models/utils/clip_sigmoid.py b/mmdet3d/models/utils/clip_sigmoid.py index 3afd4ed..2147afe 100644 --- a/mmdet3d/models/utils/clip_sigmoid.py +++ b/mmdet3d/models/utils/clip_sigmoid.py @@ -1,17 +1,17 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - - -def clip_sigmoid(x, eps=1e-4): - """Sigmoid function for input feature. - - Args: - x (torch.Tensor): Input feature map with the shape of [B, N, H, W]. 
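# Putting forward_test()/simple_test() together, a hedged end-to-end sketch of
# running one scene through a built segmentor. The config path, meta dict and
# point-cloud shape are placeholders, and init_model is assumed to be available
# from mmdet3d.apis as elsewhere in this codebase.
import torch
from mmdet3d.apis import init_model

model = init_model('configs/some_3d_seg_config.py', device='cpu')  # placeholder config
scene = torch.rand(40000, 6)                                       # one scene: xyz + rgb
img_meta = dict(pts_filename='scene0000_00.bin')                   # placeholder meta

with torch.no_grad():
    # double-nested inputs: outer list = test-time augs, inner list = scenes
    results = model(points=[[scene]], img_metas=[[img_meta]], return_loss=False)

labels = results[0]['semantic_mask']   # CPU LongTensor of shape (40000,)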
- eps (float, optional): Lower bound of the range to be clamped to. - Defaults to 1e-4. - - Returns: - torch.Tensor: Feature map after sigmoid. - """ - y = torch.clamp(x.sigmoid_(), min=eps, max=1 - eps) - return y +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def clip_sigmoid(x, eps=1e-4): + """Sigmoid function for input feature. + + Args: + x (torch.Tensor): Input feature map with the shape of [B, N, H, W]. + eps (float, optional): Lower bound of the range to be clamped to. + Defaults to 1e-4. + + Returns: + torch.Tensor: Feature map after sigmoid. + """ + y = torch.clamp(x.sigmoid_(), min=eps, max=1 - eps) + return y diff --git a/mmdet3d/models/utils/edge_indices.py b/mmdet3d/models/utils/edge_indices.py index 5dcb71f..d24ea38 100644 --- a/mmdet3d/models/utils/edge_indices.py +++ b/mmdet3d/models/utils/edge_indices.py @@ -1,88 +1,88 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numpy as np -import torch - - -def get_edge_indices(img_metas, - downsample_ratio, - step=1, - pad_mode='default', - dtype=np.float32, - device='cpu'): - """Function to filter the objects label outside the image. - The edge_indices are generated using numpy on cpu rather - than on CUDA due to the latency issue. When batch size = 8, - this function with numpy array is ~8 times faster than that - with CUDA tensor (0.09s and 0.72s in 100 runs). - - Args: - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - downsample_ratio (int): Downsample ratio of output feature, - step (int, optional): Step size used for generateing - edge indices. Default: 1. - pad_mode (str, optional): Padding mode during data pipeline. - Default: 'default'. - dtype (torch.dtype, optional): Dtype of edge indices tensor. - Default: np.float32. - device (str, optional): Device of edge indices tensor. - Default: 'cpu'. - - Returns: - list[Tensor]: Edge indices for each image in batch data. - """ - edge_indices_list = [] - for i in range(len(img_metas)): - img_shape = img_metas[i]['img_shape'] - pad_shape = img_metas[i]['pad_shape'] - h, w = img_shape[:2] - pad_h, pad_w = pad_shape - edge_indices = [] - - if pad_mode == 'default': - x_min = 0 - y_min = 0 - x_max = (w - 1) // downsample_ratio - y_max = (h - 1) // downsample_ratio - elif pad_mode == 'center': - x_min = np.ceil((pad_w - w) / 2 * downsample_ratio) - y_min = np.ceil((pad_h - h) / 2 * downsample_ratio) - x_max = x_min + w // downsample_ratio - y_max = y_min + h // downsample_ratio - else: - raise NotImplementedError - - # left - y = np.arange(y_min, y_max, step, dtype=dtype) - x = np.ones(len(y)) * x_min - - edge_indices_edge = np.stack((x, y), axis=1) - edge_indices.append(edge_indices_edge) - - # bottom - x = np.arange(x_min, x_max, step, dtype=dtype) - y = np.ones(len(x)) * y_max - - edge_indices_edge = np.stack((x, y), axis=1) - edge_indices.append(edge_indices_edge) - - # right - y = np.arange(y_max, y_min, -step, dtype=dtype) - x = np.ones(len(y)) * x_max - - edge_indices_edge = np.stack((x, y), axis=1) - edge_indices.append(edge_indices_edge) - - # top - x = np.arange(x_max, x_min, -step, dtype=dtype) - y = np.ones(len(x)) * y_min - - edge_indices_edge = np.stack((x, y), axis=1) - edge_indices.append(edge_indices_edge) - - edge_indices = \ - np.concatenate([index for index in edge_indices], axis=0) - edge_indices = torch.from_numpy(edge_indices).to(device).long() - edge_indices_list.append(edge_indices) - - return edge_indices_list +# Copyright (c) OpenMMLab. All rights reserved. 
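# A quick sanity check of clip_sigmoid() above (input values made up): extreme
# logits are clamped away from exactly 0 and 1, which keeps log-based losses
# finite. Note that the in-place sigmoid_() also mutates the input tensor.
import torch
from mmdet3d.models.utils import clip_sigmoid

x = torch.tensor([-20.0, 0.0, 20.0])
print(clip_sigmoid(x))   # approximately [1e-4, 0.5, 1 - 1e-4]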
+import numpy as np +import torch + + +def get_edge_indices(img_metas, + downsample_ratio, + step=1, + pad_mode='default', + dtype=np.float32, + device='cpu'): + """Function to filter the objects label outside the image. + The edge_indices are generated using numpy on cpu rather + than on CUDA due to the latency issue. When batch size = 8, + this function with numpy array is ~8 times faster than that + with CUDA tensor (0.09s and 0.72s in 100 runs). + + Args: + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + downsample_ratio (int): Downsample ratio of output feature, + step (int, optional): Step size used for generateing + edge indices. Default: 1. + pad_mode (str, optional): Padding mode during data pipeline. + Default: 'default'. + dtype (torch.dtype, optional): Dtype of edge indices tensor. + Default: np.float32. + device (str, optional): Device of edge indices tensor. + Default: 'cpu'. + + Returns: + list[Tensor]: Edge indices for each image in batch data. + """ + edge_indices_list = [] + for i in range(len(img_metas)): + img_shape = img_metas[i]['img_shape'] + pad_shape = img_metas[i]['pad_shape'] + h, w = img_shape[:2] + pad_h, pad_w = pad_shape + edge_indices = [] + + if pad_mode == 'default': + x_min = 0 + y_min = 0 + x_max = (w - 1) // downsample_ratio + y_max = (h - 1) // downsample_ratio + elif pad_mode == 'center': + x_min = np.ceil((pad_w - w) / 2 * downsample_ratio) + y_min = np.ceil((pad_h - h) / 2 * downsample_ratio) + x_max = x_min + w // downsample_ratio + y_max = y_min + h // downsample_ratio + else: + raise NotImplementedError + + # left + y = np.arange(y_min, y_max, step, dtype=dtype) + x = np.ones(len(y)) * x_min + + edge_indices_edge = np.stack((x, y), axis=1) + edge_indices.append(edge_indices_edge) + + # bottom + x = np.arange(x_min, x_max, step, dtype=dtype) + y = np.ones(len(x)) * y_max + + edge_indices_edge = np.stack((x, y), axis=1) + edge_indices.append(edge_indices_edge) + + # right + y = np.arange(y_max, y_min, -step, dtype=dtype) + x = np.ones(len(y)) * x_max + + edge_indices_edge = np.stack((x, y), axis=1) + edge_indices.append(edge_indices_edge) + + # top + x = np.arange(x_max, x_min, -step, dtype=dtype) + y = np.ones(len(x)) * y_min + + edge_indices_edge = np.stack((x, y), axis=1) + edge_indices.append(edge_indices_edge) + + edge_indices = \ + np.concatenate([index for index in edge_indices], axis=0) + edge_indices = torch.from_numpy(edge_indices).to(device).long() + edge_indices_list.append(edge_indices) + + return edge_indices_list diff --git a/mmdet3d/models/utils/gen_keypoints.py b/mmdet3d/models/utils/gen_keypoints.py index 8c7909b..94b8603 100644 --- a/mmdet3d/models/utils/gen_keypoints.py +++ b/mmdet3d/models/utils/gen_keypoints.py @@ -1,80 +1,80 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - -from mmdet3d.core.bbox import points_cam2img - - -def get_keypoints(gt_bboxes_3d_list, - centers2d_list, - img_metas, - use_local_coords=True): - """Function to filter the objects label outside the image. - - Args: - gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, - shape (num_gt, 4). - centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, - shape (num_gt, 2). - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - use_local_coords (bool, optional): Wheher to use local coordinates - for keypoints. Default: True. 
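[Editor's note] `get_edge_indices` walks the border of the down-sampled feature map in the order left, bottom, right, top and concatenates the coordinates into one tensor per image. A small sketch of the `'default'` pad mode with a hypothetical image size, showing the expected output shape:

import numpy as np
import torch

# hypothetical input: a 384 x 1280 image, feature map down-sampled by 4
h, w, downsample_ratio, step = 384, 1280, 4, 1
x_max, y_max = (w - 1) // downsample_ratio, (h - 1) // downsample_ratio

# walk the feature-map border: left, bottom, right, top (same order as above)
left   = np.stack([np.zeros(y_max),            np.arange(0, y_max, step)],  axis=1)
bottom = np.stack([np.arange(0, x_max, step),  np.full(x_max, y_max)],      axis=1)
right  = np.stack([np.full(y_max, x_max),      np.arange(y_max, 0, -step)], axis=1)
top    = np.stack([np.arange(x_max, 0, -step), np.zeros(x_max)],            axis=1)

edge_indices = torch.from_numpy(np.concatenate([left, bottom, right, top])).long()
print(edge_indices.shape)   # torch.Size([2 * (x_max + y_max), 2])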
- - Returns: - tuple[list[Tensor]]: It contains two elements, the first is the - keypoints for each projected 2D bbox in batch data. The second is - the visible mask of depth calculated by keypoints. - """ - - assert len(gt_bboxes_3d_list) == len(centers2d_list) - bs = len(gt_bboxes_3d_list) - keypoints2d_list = [] - keypoints_depth_mask_list = [] - - for i in range(bs): - gt_bboxes_3d = gt_bboxes_3d_list[i] - centers2d = centers2d_list[i] - img_shape = img_metas[i]['img_shape'] - cam2img = img_metas[i]['cam2img'] - h, w = img_shape[:2] - # (N, 8, 3) - corners3d = gt_bboxes_3d.corners - top_centers3d = torch.mean(corners3d[:, [0, 1, 4, 5], :], dim=1) - bot_centers3d = torch.mean(corners3d[:, [2, 3, 6, 7], :], dim=1) - # (N, 2, 3) - top_bot_centers3d = torch.stack((top_centers3d, bot_centers3d), dim=1) - keypoints3d = torch.cat((corners3d, top_bot_centers3d), dim=1) - # (N, 10, 2) - keypoints2d = points_cam2img(keypoints3d, cam2img) - - # keypoints mask: keypoints must be inside - # the image and in front of the camera - keypoints_x_visible = (keypoints2d[..., 0] >= 0) & ( - keypoints2d[..., 0] <= w - 1) - keypoints_y_visible = (keypoints2d[..., 1] >= 0) & ( - keypoints2d[..., 1] <= h - 1) - keypoints_z_visible = (keypoints3d[..., -1] > 0) - - # (N, 1O) - keypoints_visible = keypoints_x_visible & \ - keypoints_y_visible & keypoints_z_visible - # center, diag-02, diag-13 - keypoints_depth_valid = torch.stack( - (keypoints_visible[:, [8, 9]].all(dim=1), - keypoints_visible[:, [0, 3, 5, 6]].all(dim=1), - keypoints_visible[:, [1, 2, 4, 7]].all(dim=1)), - dim=1) - keypoints_visible = keypoints_visible.float() - - if use_local_coords: - keypoints2d = torch.cat((keypoints2d - centers2d.unsqueeze(1), - keypoints_visible.unsqueeze(-1)), - dim=2) - else: - keypoints2d = torch.cat( - (keypoints2d, keypoints_visible.unsqueeze(-1)), dim=2) - - keypoints2d_list.append(keypoints2d) - keypoints_depth_mask_list.append(keypoints_depth_valid) - - return (keypoints2d_list, keypoints_depth_mask_list) +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmdet3d.core.bbox import points_cam2img + + +def get_keypoints(gt_bboxes_3d_list, + centers2d_list, + img_metas, + use_local_coords=True): + """Function to filter the objects label outside the image. + + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + shape (num_gt, 4). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + shape (num_gt, 2). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + use_local_coords (bool, optional): Wheher to use local coordinates + for keypoints. Default: True. + + Returns: + tuple[list[Tensor]]: It contains two elements, the first is the + keypoints for each projected 2D bbox in batch data. The second is + the visible mask of depth calculated by keypoints. 
+ """ + + assert len(gt_bboxes_3d_list) == len(centers2d_list) + bs = len(gt_bboxes_3d_list) + keypoints2d_list = [] + keypoints_depth_mask_list = [] + + for i in range(bs): + gt_bboxes_3d = gt_bboxes_3d_list[i] + centers2d = centers2d_list[i] + img_shape = img_metas[i]['img_shape'] + cam2img = img_metas[i]['cam2img'] + h, w = img_shape[:2] + # (N, 8, 3) + corners3d = gt_bboxes_3d.corners + top_centers3d = torch.mean(corners3d[:, [0, 1, 4, 5], :], dim=1) + bot_centers3d = torch.mean(corners3d[:, [2, 3, 6, 7], :], dim=1) + # (N, 2, 3) + top_bot_centers3d = torch.stack((top_centers3d, bot_centers3d), dim=1) + keypoints3d = torch.cat((corners3d, top_bot_centers3d), dim=1) + # (N, 10, 2) + keypoints2d = points_cam2img(keypoints3d, cam2img) + + # keypoints mask: keypoints must be inside + # the image and in front of the camera + keypoints_x_visible = (keypoints2d[..., 0] >= 0) & ( + keypoints2d[..., 0] <= w - 1) + keypoints_y_visible = (keypoints2d[..., 1] >= 0) & ( + keypoints2d[..., 1] <= h - 1) + keypoints_z_visible = (keypoints3d[..., -1] > 0) + + # (N, 1O) + keypoints_visible = keypoints_x_visible & \ + keypoints_y_visible & keypoints_z_visible + # center, diag-02, diag-13 + keypoints_depth_valid = torch.stack( + (keypoints_visible[:, [8, 9]].all(dim=1), + keypoints_visible[:, [0, 3, 5, 6]].all(dim=1), + keypoints_visible[:, [1, 2, 4, 7]].all(dim=1)), + dim=1) + keypoints_visible = keypoints_visible.float() + + if use_local_coords: + keypoints2d = torch.cat((keypoints2d - centers2d.unsqueeze(1), + keypoints_visible.unsqueeze(-1)), + dim=2) + else: + keypoints2d = torch.cat( + (keypoints2d, keypoints_visible.unsqueeze(-1)), dim=2) + + keypoints2d_list.append(keypoints2d) + keypoints_depth_mask_list.append(keypoints_depth_valid) + + return (keypoints2d_list, keypoints_depth_mask_list) diff --git a/mmdet3d/models/utils/handle_objs.py b/mmdet3d/models/utils/handle_objs.py index 25fd793..1a5a451 100644 --- a/mmdet3d/models/utils/handle_objs.py +++ b/mmdet3d/models/utils/handle_objs.py @@ -1,135 +1,135 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - - -def filter_outside_objs(gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, - gt_labels_3d_list, centers2d_list, img_metas): - """Function to filter the objects label outside the image. - - Args: - gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, - each has shape (num_gt, 4). - gt_labels_list (list[Tensor]): Ground truth labels of each box, - each has shape (num_gt,). - gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each - image, each has shape (num_gt, bbox_code_size). - gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each - box, each has shape (num_gt,). - centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, - each has shape (num_gt, 2). - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. 
- """ - bs = len(centers2d_list) - - for i in range(bs): - centers2d = centers2d_list[i].clone() - img_shape = img_metas[i]['img_shape'] - keep_inds = (centers2d[:, 0] > 0) & \ - (centers2d[:, 0] < img_shape[1]) & \ - (centers2d[:, 1] > 0) & \ - (centers2d[:, 1] < img_shape[0]) - centers2d_list[i] = centers2d[keep_inds] - gt_labels_list[i] = gt_labels_list[i][keep_inds] - gt_bboxes_list[i] = gt_bboxes_list[i][keep_inds] - gt_bboxes_3d_list[i].tensor = gt_bboxes_3d_list[i].tensor[keep_inds] - gt_labels_3d_list[i] = gt_labels_3d_list[i][keep_inds] - - -def get_centers2d_target(centers2d, centers, img_shape): - """Function to get target centers2d. - - Args: - centers2d (Tensor): Projected 3D centers onto 2D images. - centers (Tensor): Centers of 2d gt bboxes. - img_shape (tuple): Resized image shape. - - Returns: - torch.Tensor: Projected 3D centers (centers2D) target. - """ - N = centers2d.shape[0] - h, w = img_shape[:2] - valid_intersects = centers2d.new_zeros((N, 2)) - a = (centers[:, 1] - centers2d[:, 1]) / (centers[:, 0] - centers2d[:, 0]) - b = centers[:, 1] - a * centers[:, 0] - left_y = b - right_y = (w - 1) * a + b - top_x = -b / a - bottom_x = (h - 1 - b) / a - - left_coors = torch.stack((left_y.new_zeros(N, ), left_y), dim=1) - right_coors = torch.stack((right_y.new_full((N, ), w - 1), right_y), dim=1) - top_coors = torch.stack((top_x, top_x.new_zeros(N, )), dim=1) - bottom_coors = torch.stack((bottom_x, bottom_x.new_full((N, ), h - 1)), - dim=1) - - intersects = torch.stack( - [left_coors, right_coors, top_coors, bottom_coors], dim=1) - intersects_x = intersects[:, :, 0] - intersects_y = intersects[:, :, 1] - inds = (intersects_x >= 0) & (intersects_x <= - w - 1) & (intersects_y >= 0) & ( - intersects_y <= h - 1) - valid_intersects = intersects[inds].reshape(N, 2, 2) - dist = torch.norm(valid_intersects - centers2d.unsqueeze(1), dim=2) - min_idx = torch.argmin(dist, dim=1) - - min_idx = min_idx.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 2) - centers2d_target = valid_intersects.gather(dim=1, index=min_idx).squeeze(1) - - return centers2d_target - - -def handle_proj_objs(centers2d_list, gt_bboxes_list, img_metas): - """Function to handle projected object centers2d, generate target - centers2d. - - Args: - gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, - shape (num_gt, 4). - centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, - shape (num_gt, 2). - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - - Returns: - tuple[list[Tensor]]: It contains three elements. The first is the - target centers2d after handling the truncated objects. The second - is the offsets between target centers2d and round int dtype - centers2d,and the last is the truncation mask for each object in - batch data. - """ - bs = len(centers2d_list) - centers2d_target_list = [] - trunc_mask_list = [] - offsets2d_list = [] - # for now, only pad mode that img is padded by right and - # bottom side is supported. 
- for i in range(bs): - centers2d = centers2d_list[i] - gt_bbox = gt_bboxes_list[i] - img_shape = img_metas[i]['img_shape'] - centers2d_target = centers2d.clone() - inside_inds = (centers2d[:, 0] > 0) & \ - (centers2d[:, 0] < img_shape[1]) & \ - (centers2d[:, 1] > 0) & \ - (centers2d[:, 1] < img_shape[0]) - outside_inds = ~inside_inds - - # if there are outside objects - if outside_inds.any(): - centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2 - outside_centers2d = centers2d[outside_inds] - match_centers = centers[outside_inds] - target_outside_centers2d = get_centers2d_target( - outside_centers2d, match_centers, img_shape) - centers2d_target[outside_inds] = target_outside_centers2d - - offsets2d = centers2d - centers2d_target.round().int() - trunc_mask = outside_inds - - centers2d_target_list.append(centers2d_target) - trunc_mask_list.append(trunc_mask) - offsets2d_list.append(offsets2d) - - return (centers2d_target_list, offsets2d_list, trunc_mask_list) +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def filter_outside_objs(gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, + gt_labels_3d_list, centers2d_list, img_metas): + """Function to filter the objects label outside the image. + + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each + image, each has shape (num_gt, bbox_code_size). + gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each + box, each has shape (num_gt,). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + each has shape (num_gt, 2). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + """ + bs = len(centers2d_list) + + for i in range(bs): + centers2d = centers2d_list[i].clone() + img_shape = img_metas[i]['img_shape'] + keep_inds = (centers2d[:, 0] > 0) & \ + (centers2d[:, 0] < img_shape[1]) & \ + (centers2d[:, 1] > 0) & \ + (centers2d[:, 1] < img_shape[0]) + centers2d_list[i] = centers2d[keep_inds] + gt_labels_list[i] = gt_labels_list[i][keep_inds] + gt_bboxes_list[i] = gt_bboxes_list[i][keep_inds] + gt_bboxes_3d_list[i].tensor = gt_bboxes_3d_list[i].tensor[keep_inds] + gt_labels_3d_list[i] = gt_labels_3d_list[i][keep_inds] + + +def get_centers2d_target(centers2d, centers, img_shape): + """Function to get target centers2d. + + Args: + centers2d (Tensor): Projected 3D centers onto 2D images. + centers (Tensor): Centers of 2d gt bboxes. + img_shape (tuple): Resized image shape. + + Returns: + torch.Tensor: Projected 3D centers (centers2D) target. 
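[Editor's note] For a truncated object, `get_centers2d_target` intersects the line through the 2D box center and the projected 3D center with the four image borders, keeps the intersections that lie on the image, and picks the one closest to the projected center. A worked single-point sketch of the same geometry with toy numbers:

import torch

# toy single object: 2D box center inside the image, projected 3D center outside
h, w = 100, 200
center = torch.tensor([100.0, 50.0])      # (x, y) of the 2D gt box center
center2d = torch.tensor([250.0, 80.0])    # projected 3D center, truncated (x > w - 1)

a = (center[1] - center2d[1]) / (center[0] - center2d[0])   # slope = 0.2
b = center[1] - a * center[0]                               # intercept = 30.0

# intersections of that line with the four image borders
candidates = torch.stack([
    torch.stack([torch.tensor(0.0),       b]),                  # left border   x = 0
    torch.stack([torch.tensor(w - 1.0),   a * (w - 1) + b]),    # right border  x = w - 1
    torch.stack([-b / a,                  torch.tensor(0.0)]),  # top border    y = 0
    torch.stack([(h - 1 - b) / a,         torch.tensor(h - 1.0)]),  # bottom border y = h - 1
])
inside = (candidates[:, 0] >= 0) & (candidates[:, 0] <= w - 1) & \
         (candidates[:, 1] >= 0) & (candidates[:, 1] <= h - 1)
valid = candidates[inside]                                   # generically 2 on-image intersections
target = valid[torch.norm(valid - center2d, dim=1).argmin()]
print(target)   # tensor([199.0000, 69.8000]) -- nearest on-image intersection to center2d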
+ """ + N = centers2d.shape[0] + h, w = img_shape[:2] + valid_intersects = centers2d.new_zeros((N, 2)) + a = (centers[:, 1] - centers2d[:, 1]) / (centers[:, 0] - centers2d[:, 0]) + b = centers[:, 1] - a * centers[:, 0] + left_y = b + right_y = (w - 1) * a + b + top_x = -b / a + bottom_x = (h - 1 - b) / a + + left_coors = torch.stack((left_y.new_zeros(N, ), left_y), dim=1) + right_coors = torch.stack((right_y.new_full((N, ), w - 1), right_y), dim=1) + top_coors = torch.stack((top_x, top_x.new_zeros(N, )), dim=1) + bottom_coors = torch.stack((bottom_x, bottom_x.new_full((N, ), h - 1)), + dim=1) + + intersects = torch.stack( + [left_coors, right_coors, top_coors, bottom_coors], dim=1) + intersects_x = intersects[:, :, 0] + intersects_y = intersects[:, :, 1] + inds = (intersects_x >= 0) & (intersects_x <= + w - 1) & (intersects_y >= 0) & ( + intersects_y <= h - 1) + valid_intersects = intersects[inds].reshape(N, 2, 2) + dist = torch.norm(valid_intersects - centers2d.unsqueeze(1), dim=2) + min_idx = torch.argmin(dist, dim=1) + + min_idx = min_idx.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 2) + centers2d_target = valid_intersects.gather(dim=1, index=min_idx).squeeze(1) + + return centers2d_target + + +def handle_proj_objs(centers2d_list, gt_bboxes_list, img_metas): + """Function to handle projected object centers2d, generate target + centers2d. + + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + shape (num_gt, 4). + centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, + shape (num_gt, 2). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + tuple[list[Tensor]]: It contains three elements. The first is the + target centers2d after handling the truncated objects. The second + is the offsets between target centers2d and round int dtype + centers2d,and the last is the truncation mask for each object in + batch data. + """ + bs = len(centers2d_list) + centers2d_target_list = [] + trunc_mask_list = [] + offsets2d_list = [] + # for now, only pad mode that img is padded by right and + # bottom side is supported. + for i in range(bs): + centers2d = centers2d_list[i] + gt_bbox = gt_bboxes_list[i] + img_shape = img_metas[i]['img_shape'] + centers2d_target = centers2d.clone() + inside_inds = (centers2d[:, 0] > 0) & \ + (centers2d[:, 0] < img_shape[1]) & \ + (centers2d[:, 1] > 0) & \ + (centers2d[:, 1] < img_shape[0]) + outside_inds = ~inside_inds + + # if there are outside objects + if outside_inds.any(): + centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2 + outside_centers2d = centers2d[outside_inds] + match_centers = centers[outside_inds] + target_outside_centers2d = get_centers2d_target( + outside_centers2d, match_centers, img_shape) + centers2d_target[outside_inds] = target_outside_centers2d + + offsets2d = centers2d - centers2d_target.round().int() + trunc_mask = outside_inds + + centers2d_target_list.append(centers2d_target) + trunc_mask_list.append(trunc_mask) + offsets2d_list.append(offsets2d) + + return (centers2d_target_list, offsets2d_list, trunc_mask_list) diff --git a/mmdet3d/models/utils/mlp.py b/mmdet3d/models/utils/mlp.py index 0b499bb..5b06b36 100644 --- a/mmdet3d/models/utils/mlp.py +++ b/mmdet3d/models/utils/mlp.py @@ -1,51 +1,51 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule -from torch import nn as nn - - -class MLP(BaseModule): - """A simple MLP module. - - Pass features (B, C, N) through an MLP. 
- - Args: - in_channels (int, optional): Number of channels of input features. - Default: 18. - conv_channels (tuple[int], optional): Out channels of the convolution. - Default: (256, 256). - conv_cfg (dict, optional): Config of convolution. - Default: dict(type='Conv1d'). - norm_cfg (dict, optional): Config of normalization. - Default: dict(type='BN1d'). - act_cfg (dict, optional): Config of activation. - Default: dict(type='ReLU'). - """ - - def __init__(self, - in_channel=18, - conv_channels=(256, 256), - conv_cfg=dict(type='Conv1d'), - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.mlp = nn.Sequential() - prev_channels = in_channel - for i, conv_channel in enumerate(conv_channels): - self.mlp.add_module( - f'layer{i}', - ConvModule( - prev_channels, - conv_channels[i], - 1, - padding=0, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - bias=True, - inplace=True)) - prev_channels = conv_channels[i] - - def forward(self, img_features): - return self.mlp(img_features) +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule +from torch import nn as nn + + +class MLP(BaseModule): + """A simple MLP module. + + Pass features (B, C, N) through an MLP. + + Args: + in_channels (int, optional): Number of channels of input features. + Default: 18. + conv_channels (tuple[int], optional): Out channels of the convolution. + Default: (256, 256). + conv_cfg (dict, optional): Config of convolution. + Default: dict(type='Conv1d'). + norm_cfg (dict, optional): Config of normalization. + Default: dict(type='BN1d'). + act_cfg (dict, optional): Config of activation. + Default: dict(type='ReLU'). + """ + + def __init__(self, + in_channel=18, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.mlp = nn.Sequential() + prev_channels = in_channel + for i, conv_channel in enumerate(conv_channels): + self.mlp.add_module( + f'layer{i}', + ConvModule( + prev_channels, + conv_channels[i], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=True, + inplace=True)) + prev_channels = conv_channels[i] + + def forward(self, img_features): + return self.mlp(img_features) diff --git a/mmdet3d/models/voxel_encoders/__init__.py b/mmdet3d/models/voxel_encoders/__init__.py index 2926a83..9e5ac00 100644 --- a/mmdet3d/models/voxel_encoders/__init__.py +++ b/mmdet3d/models/voxel_encoders/__init__.py @@ -1,8 +1,8 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .pillar_encoder import DynamicPillarFeatureNet, PillarFeatureNet -from .voxel_encoder import DynamicSimpleVFE, DynamicVFE, HardSimpleVFE, HardVFE - -__all__ = [ - 'PillarFeatureNet', 'DynamicPillarFeatureNet', 'HardVFE', 'DynamicVFE', - 'HardSimpleVFE', 'DynamicSimpleVFE' -] +# Copyright (c) OpenMMLab. All rights reserved. 
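[Editor's note] The `MLP` wrapper above stacks `ConvModule`s with kernel size 1, i.e. a shared linear layer applied to every one of the N feature vectors in a (B, C, N) tensor. A dependency-free sketch using plain `torch.nn`, assuming the mmcv `ConvModule` defaults of Conv1d -> BN1d -> ReLU:

import torch
from torch import nn

def make_mlp(in_channel=18, conv_channels=(256, 256)):
    # 1x1 Conv1d + BN1d + ReLU per layer, mirroring ConvModule's default order
    layers, prev = [], in_channel
    for out in conv_channels:
        layers += [nn.Conv1d(prev, out, kernel_size=1),
                   nn.BatchNorm1d(out),
                   nn.ReLU(inplace=True)]
        prev = out
    return nn.Sequential(*layers)

mlp = make_mlp()
feats = torch.rand(2, 18, 256)          # (B, C, N): 256 per-sample feature vectors
out = mlp(feats)
print(out.shape)                        # torch.Size([2, 256, 256])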
+from .pillar_encoder import DynamicPillarFeatureNet, PillarFeatureNet +from .voxel_encoder import DynamicSimpleVFE, DynamicVFE, HardSimpleVFE, HardVFE + +__all__ = [ + 'PillarFeatureNet', 'DynamicPillarFeatureNet', 'HardVFE', 'DynamicVFE', + 'HardSimpleVFE', 'DynamicSimpleVFE' +] diff --git a/mmdet3d/models/voxel_encoders/pillar_encoder.py b/mmdet3d/models/voxel_encoders/pillar_encoder.py index 39bdc72..a0a19d6 100644 --- a/mmdet3d/models/voxel_encoders/pillar_encoder.py +++ b/mmdet3d/models/voxel_encoders/pillar_encoder.py @@ -1,323 +1,323 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import build_norm_layer -from mmcv.ops import DynamicScatter -from mmcv.runner import force_fp32 -from torch import nn - -from ..builder import VOXEL_ENCODERS -from .utils import PFNLayer, get_paddings_indicator - - -@VOXEL_ENCODERS.register_module() -class PillarFeatureNet(nn.Module): - """Pillar Feature Net. - - The network prepares the pillar features and performs forward pass - through PFNLayers. - - Args: - in_channels (int, optional): Number of input features, - either x, y, z or x, y, z, r. Defaults to 4. - feat_channels (tuple, optional): Number of features in each of the - N PFNLayers. Defaults to (64, ). - with_distance (bool, optional): Whether to include Euclidean distance - to points. Defaults to False. - with_cluster_center (bool, optional): [description]. Defaults to True. - with_voxel_center (bool, optional): [description]. Defaults to True. - voxel_size (tuple[float], optional): Size of voxels, only utilize x - and y size. Defaults to (0.2, 0.2, 4). - point_cloud_range (tuple[float], optional): Point cloud range, only - utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1). - norm_cfg ([type], optional): [description]. - Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). - mode (str, optional): The mode to gather point features. Options are - 'max' or 'avg'. Defaults to 'max'. - legacy (bool, optional): Whether to use the new behavior or - the original behavior. Defaults to True. 
- """ - - def __init__(self, - in_channels=4, - feat_channels=(64, ), - with_distance=False, - with_cluster_center=True, - with_voxel_center=True, - voxel_size=(0.2, 0.2, 4), - point_cloud_range=(0, -40, -3, 70.4, 40, 1), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - mode='max', - legacy=True): - super(PillarFeatureNet, self).__init__() - assert len(feat_channels) > 0 - self.legacy = legacy - if with_cluster_center: - in_channels += 3 - if with_voxel_center: - in_channels += 3 - if with_distance: - in_channels += 1 - self._with_distance = with_distance - self._with_cluster_center = with_cluster_center - self._with_voxel_center = with_voxel_center - self.fp16_enabled = False - # Create PillarFeatureNet layers - self.in_channels = in_channels - feat_channels = [in_channels] + list(feat_channels) - pfn_layers = [] - for i in range(len(feat_channels) - 1): - in_filters = feat_channels[i] - out_filters = feat_channels[i + 1] - if i < len(feat_channels) - 2: - last_layer = False - else: - last_layer = True - pfn_layers.append( - PFNLayer( - in_filters, - out_filters, - norm_cfg=norm_cfg, - last_layer=last_layer, - mode=mode)) - self.pfn_layers = nn.ModuleList(pfn_layers) - - # Need pillar (voxel) size and x/y offset in order to calculate offset - self.vx = voxel_size[0] - self.vy = voxel_size[1] - self.vz = voxel_size[2] - self.x_offset = self.vx / 2 + point_cloud_range[0] - self.y_offset = self.vy / 2 + point_cloud_range[1] - self.z_offset = self.vz / 2 + point_cloud_range[2] - self.point_cloud_range = point_cloud_range - - @force_fp32(out_fp16=True) - def forward(self, features, num_points, coors): - """Forward function. - - Args: - features (torch.Tensor): Point features or raw points in shape - (N, M, C). - num_points (torch.Tensor): Number of points in each pillar. - coors (torch.Tensor): Coordinates of each voxel. - - Returns: - torch.Tensor: Features of pillars. - """ - features_ls = [features] - # Find distance of x, y, and z from cluster center - if self._with_cluster_center: - points_mean = features[:, :, :3].sum( - dim=1, keepdim=True) / num_points.type_as(features).view( - -1, 1, 1) - f_cluster = features[:, :, :3] - points_mean - features_ls.append(f_cluster) - - # Find distance of x, y, and z from pillar center - dtype = features.dtype - if self._with_voxel_center: - if not self.legacy: - f_center = torch.zeros_like(features[:, :, :3]) - f_center[:, :, 0] = features[:, :, 0] - ( - coors[:, 3].to(dtype).unsqueeze(1) * self.vx + - self.x_offset) - f_center[:, :, 1] = features[:, :, 1] - ( - coors[:, 2].to(dtype).unsqueeze(1) * self.vy + - self.y_offset) - f_center[:, :, 2] = features[:, :, 2] - ( - coors[:, 1].to(dtype).unsqueeze(1) * self.vz + - self.z_offset) - else: - f_center = features[:, :, :3] - f_center[:, :, 0] = f_center[:, :, 0] - ( - coors[:, 3].type_as(features).unsqueeze(1) * self.vx + - self.x_offset) - f_center[:, :, 1] = f_center[:, :, 1] - ( - coors[:, 2].type_as(features).unsqueeze(1) * self.vy + - self.y_offset) - f_center[:, :, 2] = f_center[:, :, 2] - ( - coors[:, 1].type_as(features).unsqueeze(1) * self.vz + - self.z_offset) - features_ls.append(f_center) - - if self._with_distance: - points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) - features_ls.append(points_dist) - - # Combine together feature decorations - features = torch.cat(features_ls, dim=-1) - # The feature decorations were calculated without regard to whether - # pillar was empty. Need to ensure that - # empty pillars remain set to zeros. 
- voxel_count = features.shape[1] - mask = get_paddings_indicator(num_points, voxel_count, axis=0) - mask = torch.unsqueeze(mask, -1).type_as(features) - features *= mask - - for pfn in self.pfn_layers: - features = pfn(features, num_points) - - return features.squeeze(1) - - -@VOXEL_ENCODERS.register_module() -class DynamicPillarFeatureNet(PillarFeatureNet): - """Pillar Feature Net using dynamic voxelization. - - The network prepares the pillar features and performs forward pass - through PFNLayers. The main difference is that it is used for - dynamic voxels, which contains different number of points inside a voxel - without limits. - - Args: - in_channels (int, optional): Number of input features, - either x, y, z or x, y, z, r. Defaults to 4. - feat_channels (tuple, optional): Number of features in each of the - N PFNLayers. Defaults to (64, ). - with_distance (bool, optional): Whether to include Euclidean distance - to points. Defaults to False. - with_cluster_center (bool, optional): [description]. Defaults to True. - with_voxel_center (bool, optional): [description]. Defaults to True. - voxel_size (tuple[float], optional): Size of voxels, only utilize x - and y size. Defaults to (0.2, 0.2, 4). - point_cloud_range (tuple[float], optional): Point cloud range, only - utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1). - norm_cfg ([type], optional): [description]. - Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). - mode (str, optional): The mode to gather point features. Options are - 'max' or 'avg'. Defaults to 'max'. - legacy (bool, optional): Whether to use the new behavior or - the original behavior. Defaults to True. - """ - - def __init__(self, - in_channels=4, - feat_channels=(64, ), - with_distance=False, - with_cluster_center=True, - with_voxel_center=True, - voxel_size=(0.2, 0.2, 4), - point_cloud_range=(0, -40, -3, 70.4, 40, 1), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - mode='max', - legacy=True): - super(DynamicPillarFeatureNet, self).__init__( - in_channels, - feat_channels, - with_distance, - with_cluster_center=with_cluster_center, - with_voxel_center=with_voxel_center, - voxel_size=voxel_size, - point_cloud_range=point_cloud_range, - norm_cfg=norm_cfg, - mode=mode, - legacy=legacy) - self.fp16_enabled = False - feat_channels = [self.in_channels] + list(feat_channels) - pfn_layers = [] - # TODO: currently only support one PFNLayer - - for i in range(len(feat_channels) - 1): - in_filters = feat_channels[i] - out_filters = feat_channels[i + 1] - if i > 0: - in_filters *= 2 - norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) - pfn_layers.append( - nn.Sequential( - nn.Linear(in_filters, out_filters, bias=False), norm_layer, - nn.ReLU(inplace=True))) - self.num_pfn = len(pfn_layers) - self.pfn_layers = nn.ModuleList(pfn_layers) - self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range, - (mode != 'max')) - self.cluster_scatter = DynamicScatter( - voxel_size, point_cloud_range, average_points=True) - - def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): - """Map the centers of voxels to its corresponding points. - - Args: - pts_coors (torch.Tensor): The coordinates of each points, shape - (M, 3), where M is the number of points. - voxel_mean (torch.Tensor): The mean or aggregated features of a - voxel, shape (N, C), where N is the number of voxels. - voxel_coors (torch.Tensor): The coordinates of each voxel. 
- - Returns: - torch.Tensor: Corresponding voxel centers of each points, shape - (M, C), where M is the number of points. - """ - # Step 1: scatter voxel into canvas - # Calculate necessary things for canvas creation - canvas_y = int( - (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) - canvas_x = int( - (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) - canvas_channel = voxel_mean.size(1) - batch_size = pts_coors[-1, 0] + 1 - canvas_len = canvas_y * canvas_x * batch_size - # Create the canvas for this sample - canvas = voxel_mean.new_zeros(canvas_channel, canvas_len) - # Only include non-empty pillars - indices = ( - voxel_coors[:, 0] * canvas_y * canvas_x + - voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) - # Scatter the blob back to the canvas - canvas[:, indices.long()] = voxel_mean.t() - - # Step 2: get voxel mean for each point - voxel_index = ( - pts_coors[:, 0] * canvas_y * canvas_x + - pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) - center_per_point = canvas[:, voxel_index.long()].t() - return center_per_point - - @force_fp32(out_fp16=True) - def forward(self, features, coors): - """Forward function. - - Args: - features (torch.Tensor): Point features or raw points in shape - (N, M, C). - coors (torch.Tensor): Coordinates of each voxel - - Returns: - torch.Tensor: Features of pillars. - """ - features_ls = [features] - # Find distance of x, y, and z from cluster center - if self._with_cluster_center: - voxel_mean, mean_coors = self.cluster_scatter(features, coors) - points_mean = self.map_voxel_center_to_point( - coors, voxel_mean, mean_coors) - # TODO: maybe also do cluster for reflectivity - f_cluster = features[:, :3] - points_mean[:, :3] - features_ls.append(f_cluster) - - # Find distance of x, y, and z from pillar center - if self._with_voxel_center: - f_center = features.new_zeros(size=(features.size(0), 3)) - f_center[:, 0] = features[:, 0] - ( - coors[:, 3].type_as(features) * self.vx + self.x_offset) - f_center[:, 1] = features[:, 1] - ( - coors[:, 2].type_as(features) * self.vy + self.y_offset) - f_center[:, 2] = features[:, 2] - ( - coors[:, 1].type_as(features) * self.vz + self.z_offset) - features_ls.append(f_center) - - if self._with_distance: - points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) - features_ls.append(points_dist) - - # Combine together feature decorations - features = torch.cat(features_ls, dim=-1) - for i, pfn in enumerate(self.pfn_layers): - point_feats = pfn(features) - voxel_feats, voxel_coors = self.pfn_scatter(point_feats, coors) - if i != len(self.pfn_layers) - 1: - # need to concat voxel feats if it is not the last pfn - feat_per_point = self.map_voxel_center_to_point( - coors, voxel_feats, voxel_coors) - features = torch.cat([point_feats, feat_per_point], dim=1) - - return voxel_feats, voxel_coors +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import build_norm_layer +from mmcv.ops import DynamicScatter +from mmcv.runner import force_fp32 +from torch import nn + +from ..builder import VOXEL_ENCODERS +from .utils import PFNLayer, get_paddings_indicator + + +@VOXEL_ENCODERS.register_module() +class PillarFeatureNet(nn.Module): + """Pillar Feature Net. + + The network prepares the pillar features and performs forward pass + through PFNLayers. + + Args: + in_channels (int, optional): Number of input features, + either x, y, z or x, y, z, r. Defaults to 4. + feat_channels (tuple, optional): Number of features in each of the + N PFNLayers. Defaults to (64, ). 
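[Editor's note] `map_voxel_center_to_point` flattens the (batch, y, x) pillar coordinates into a single canvas index, scatters the per-voxel features onto that canvas, and gathers them back per point with the same formula, so every point receives the feature of its own pillar. A compact index-based sketch of that round trip with toy sizes:

import torch

canvas_x, canvas_y, batch_size, channels = 4, 3, 2, 5
voxel_feats = torch.rand(6, channels)                     # 6 non-empty pillars
voxel_coors = torch.tensor([[0, 0, 0, 1], [0, 0, 2, 3],   # (batch, z, y, x)
                            [0, 0, 1, 0], [1, 0, 0, 2],
                            [1, 0, 2, 1], [1, 0, 1, 3]])
pts_coors = voxel_coors[[0, 0, 3, 5, 2]]                  # 5 points, each inside one pillar

def flat_index(coors):
    return coors[:, 0] * canvas_y * canvas_x + coors[:, 2] * canvas_x + coors[:, 3]

canvas = voxel_feats.new_zeros(channels, canvas_y * canvas_x * batch_size)
canvas[:, flat_index(voxel_coors)] = voxel_feats.t()      # scatter pillars onto the canvas
per_point = canvas[:, flat_index(pts_coors)].t()          # gather back, one feature per point

assert torch.allclose(per_point[2], voxel_feats[3])       # point 2 lives in pillar 3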
+ with_distance (bool, optional): Whether to include Euclidean distance + to points. Defaults to False. + with_cluster_center (bool, optional): [description]. Defaults to True. + with_voxel_center (bool, optional): [description]. Defaults to True. + voxel_size (tuple[float], optional): Size of voxels, only utilize x + and y size. Defaults to (0.2, 0.2, 4). + point_cloud_range (tuple[float], optional): Point cloud range, only + utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1). + norm_cfg ([type], optional): [description]. + Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). + mode (str, optional): The mode to gather point features. Options are + 'max' or 'avg'. Defaults to 'max'. + legacy (bool, optional): Whether to use the new behavior or + the original behavior. Defaults to True. + """ + + def __init__(self, + in_channels=4, + feat_channels=(64, ), + with_distance=False, + with_cluster_center=True, + with_voxel_center=True, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max', + legacy=True): + super(PillarFeatureNet, self).__init__() + assert len(feat_channels) > 0 + self.legacy = legacy + if with_cluster_center: + in_channels += 3 + if with_voxel_center: + in_channels += 3 + if with_distance: + in_channels += 1 + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + self.fp16_enabled = False + # Create PillarFeatureNet layers + self.in_channels = in_channels + feat_channels = [in_channels] + list(feat_channels) + pfn_layers = [] + for i in range(len(feat_channels) - 1): + in_filters = feat_channels[i] + out_filters = feat_channels[i + 1] + if i < len(feat_channels) - 2: + last_layer = False + else: + last_layer = True + pfn_layers.append( + PFNLayer( + in_filters, + out_filters, + norm_cfg=norm_cfg, + last_layer=last_layer, + mode=mode)) + self.pfn_layers = nn.ModuleList(pfn_layers) + + # Need pillar (voxel) size and x/y offset in order to calculate offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + self.point_cloud_range = point_cloud_range + + @force_fp32(out_fp16=True) + def forward(self, features, num_points, coors): + """Forward function. + + Args: + features (torch.Tensor): Point features or raw points in shape + (N, M, C). + num_points (torch.Tensor): Number of points in each pillar. + coors (torch.Tensor): Coordinates of each voxel. + + Returns: + torch.Tensor: Features of pillars. 
+ """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + points_mean = features[:, :, :3].sum( + dim=1, keepdim=True) / num_points.type_as(features).view( + -1, 1, 1) + f_cluster = features[:, :, :3] - points_mean + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + dtype = features.dtype + if self._with_voxel_center: + if not self.legacy: + f_center = torch.zeros_like(features[:, :, :3]) + f_center[:, :, 0] = features[:, :, 0] - ( + coors[:, 3].to(dtype).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = features[:, :, 1] - ( + coors[:, 2].to(dtype).unsqueeze(1) * self.vy + + self.y_offset) + f_center[:, :, 2] = features[:, :, 2] - ( + coors[:, 1].to(dtype).unsqueeze(1) * self.vz + + self.z_offset) + else: + f_center = features[:, :, :3] + f_center[:, :, 0] = f_center[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = f_center[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + f_center[:, :, 2] = f_center[:, :, 2] - ( + coors[:, 1].type_as(features).unsqueeze(1) * self.vz + + self.z_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + # The feature decorations were calculated without regard to whether + # pillar was empty. Need to ensure that + # empty pillars remain set to zeros. + voxel_count = features.shape[1] + mask = get_paddings_indicator(num_points, voxel_count, axis=0) + mask = torch.unsqueeze(mask, -1).type_as(features) + features *= mask + + for pfn in self.pfn_layers: + features = pfn(features, num_points) + + return features.squeeze(1) + + +@VOXEL_ENCODERS.register_module() +class DynamicPillarFeatureNet(PillarFeatureNet): + """Pillar Feature Net using dynamic voxelization. + + The network prepares the pillar features and performs forward pass + through PFNLayers. The main difference is that it is used for + dynamic voxels, which contains different number of points inside a voxel + without limits. + + Args: + in_channels (int, optional): Number of input features, + either x, y, z or x, y, z, r. Defaults to 4. + feat_channels (tuple, optional): Number of features in each of the + N PFNLayers. Defaults to (64, ). + with_distance (bool, optional): Whether to include Euclidean distance + to points. Defaults to False. + with_cluster_center (bool, optional): [description]. Defaults to True. + with_voxel_center (bool, optional): [description]. Defaults to True. + voxel_size (tuple[float], optional): Size of voxels, only utilize x + and y size. Defaults to (0.2, 0.2, 4). + point_cloud_range (tuple[float], optional): Point cloud range, only + utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1). + norm_cfg ([type], optional): [description]. + Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). + mode (str, optional): The mode to gather point features. Options are + 'max' or 'avg'. Defaults to 'max'. + legacy (bool, optional): Whether to use the new behavior or + the original behavior. Defaults to True. 
+ """ + + def __init__(self, + in_channels=4, + feat_channels=(64, ), + with_distance=False, + with_cluster_center=True, + with_voxel_center=True, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max', + legacy=True): + super(DynamicPillarFeatureNet, self).__init__( + in_channels, + feat_channels, + with_distance, + with_cluster_center=with_cluster_center, + with_voxel_center=with_voxel_center, + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + norm_cfg=norm_cfg, + mode=mode, + legacy=legacy) + self.fp16_enabled = False + feat_channels = [self.in_channels] + list(feat_channels) + pfn_layers = [] + # TODO: currently only support one PFNLayer + + for i in range(len(feat_channels) - 1): + in_filters = feat_channels[i] + out_filters = feat_channels[i + 1] + if i > 0: + in_filters *= 2 + norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + pfn_layers.append( + nn.Sequential( + nn.Linear(in_filters, out_filters, bias=False), norm_layer, + nn.ReLU(inplace=True))) + self.num_pfn = len(pfn_layers) + self.pfn_layers = nn.ModuleList(pfn_layers) + self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range, + (mode != 'max')) + self.cluster_scatter = DynamicScatter( + voxel_size, point_cloud_range, average_points=True) + + def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): + """Map the centers of voxels to its corresponding points. + + Args: + pts_coors (torch.Tensor): The coordinates of each points, shape + (M, 3), where M is the number of points. + voxel_mean (torch.Tensor): The mean or aggregated features of a + voxel, shape (N, C), where N is the number of voxels. + voxel_coors (torch.Tensor): The coordinates of each voxel. + + Returns: + torch.Tensor: Corresponding voxel centers of each points, shape + (M, C), where M is the number of points. + """ + # Step 1: scatter voxel into canvas + # Calculate necessary things for canvas creation + canvas_y = int( + (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) + canvas_x = int( + (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) + canvas_channel = voxel_mean.size(1) + batch_size = pts_coors[-1, 0] + 1 + canvas_len = canvas_y * canvas_x * batch_size + # Create the canvas for this sample + canvas = voxel_mean.new_zeros(canvas_channel, canvas_len) + # Only include non-empty pillars + indices = ( + voxel_coors[:, 0] * canvas_y * canvas_x + + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) + # Scatter the blob back to the canvas + canvas[:, indices.long()] = voxel_mean.t() + + # Step 2: get voxel mean for each point + voxel_index = ( + pts_coors[:, 0] * canvas_y * canvas_x + + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) + center_per_point = canvas[:, voxel_index.long()].t() + return center_per_point + + @force_fp32(out_fp16=True) + def forward(self, features, coors): + """Forward function. + + Args: + features (torch.Tensor): Point features or raw points in shape + (N, M, C). + coors (torch.Tensor): Coordinates of each voxel + + Returns: + torch.Tensor: Features of pillars. 
+ """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + voxel_mean, mean_coors = self.cluster_scatter(features, coors) + points_mean = self.map_voxel_center_to_point( + coors, voxel_mean, mean_coors) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :3] - points_mean[:, :3] + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros(size=(features.size(0), 3)) + f_center[:, 0] = features[:, 0] - ( + coors[:, 3].type_as(features) * self.vx + self.x_offset) + f_center[:, 1] = features[:, 1] - ( + coors[:, 2].type_as(features) * self.vy + self.y_offset) + f_center[:, 2] = features[:, 2] - ( + coors[:, 1].type_as(features) * self.vz + self.z_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + for i, pfn in enumerate(self.pfn_layers): + point_feats = pfn(features) + voxel_feats, voxel_coors = self.pfn_scatter(point_feats, coors) + if i != len(self.pfn_layers) - 1: + # need to concat voxel feats if it is not the last pfn + feat_per_point = self.map_voxel_center_to_point( + coors, voxel_feats, voxel_coors) + features = torch.cat([point_feats, feat_per_point], dim=1) + + return voxel_feats, voxel_coors diff --git a/mmdet3d/models/voxel_encoders/utils.py b/mmdet3d/models/voxel_encoders/utils.py index 8c54fc2..8e3a010 100644 --- a/mmdet3d/models/voxel_encoders/utils.py +++ b/mmdet3d/models/voxel_encoders/utils.py @@ -1,182 +1,182 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import build_norm_layer -from mmcv.runner import auto_fp16 -from torch import nn -from torch.nn import functional as F - - -def get_paddings_indicator(actual_num, max_num, axis=0): - """Create boolean mask by actually number of a padded tensor. - - Args: - actual_num (torch.Tensor): Actual number of points in each voxel. - max_num (int): Max number of points in each voxel - - Returns: - torch.Tensor: Mask indicates which points are valid inside a voxel. - """ - actual_num = torch.unsqueeze(actual_num, axis + 1) - # tiled_actual_num: [N, M, 1] - max_num_shape = [1] * len(actual_num.shape) - max_num_shape[axis + 1] = -1 - max_num = torch.arange( - max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape) - # tiled_actual_num: [[3,3,3,3,3], [4,4,4,4,4], [2,2,2,2,2]] - # tiled_max_num: [[0,1,2,3,4], [0,1,2,3,4], [0,1,2,3,4]] - paddings_indicator = actual_num.int() > max_num - # paddings_indicator shape: [batch_size, max_num] - return paddings_indicator - - -class VFELayer(nn.Module): - """Voxel Feature Encoder layer. - - The voxel encoder is composed of a series of these layers. - This module do not support average pooling and only support to use - max pooling to gather features inside a VFE. - - Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - norm_cfg (dict): Config dict of normalization layers - max_out (bool): Whether aggregate the features of points inside - each voxel and only return voxel features. - cat_max (bool): Whether concatenate the aggregated features - and pointwise features. 
- """ - - def __init__(self, - in_channels, - out_channels, - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - max_out=True, - cat_max=True): - super(VFELayer, self).__init__() - self.fp16_enabled = False - self.cat_max = cat_max - self.max_out = max_out - # self.units = int(out_channels / 2) - - self.norm = build_norm_layer(norm_cfg, out_channels)[1] - self.linear = nn.Linear(in_channels, out_channels, bias=False) - - @auto_fp16(apply_to=('inputs'), out_fp32=True) - def forward(self, inputs): - """Forward function. - - Args: - inputs (torch.Tensor): Voxels features of shape (N, M, C). - N is the number of voxels, M is the number of points in - voxels, C is the number of channels of point features. - - Returns: - torch.Tensor: Voxel features. There are three mode under which the - features have different meaning. - - `max_out=False`: Return point-wise features in - shape (N, M, C). - - `max_out=True` and `cat_max=False`: Return aggregated - voxel features in shape (N, C) - - `max_out=True` and `cat_max=True`: Return concatenated - point-wise features in shape (N, M, C). - """ - # [K, T, 7] tensordot [7, units] = [K, T, units] - voxel_count = inputs.shape[1] - - x = self.linear(inputs) - x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, - 1).contiguous() - pointwise = F.relu(x) - # [K, T, units] - if self.max_out: - aggregated = torch.max(pointwise, dim=1, keepdim=True)[0] - else: - # this is for fusion layer - return pointwise - - if not self.cat_max: - return aggregated.squeeze(1) - else: - # [K, 1, units] - repeated = aggregated.repeat(1, voxel_count, 1) - concatenated = torch.cat([pointwise, repeated], dim=2) - # [K, T, 2 * units] - return concatenated - - -class PFNLayer(nn.Module): - """Pillar Feature Net Layer. - - The Pillar Feature Net is composed of a series of these layers, but the - PointPillars paper results only used a single PFNLayer. - - Args: - in_channels (int): Number of input channels. - out_channels (int): Number of output channels. - norm_cfg (dict, optional): Config dict of normalization layers. - Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). - last_layer (bool, optional): If last_layer, there is no - concatenation of features. Defaults to False. - mode (str, optional): Pooling model to gather features inside voxels. - Defaults to 'max'. - """ - - def __init__(self, - in_channels, - out_channels, - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - last_layer=False, - mode='max'): - - super().__init__() - self.fp16_enabled = False - self.name = 'PFNLayer' - self.last_vfe = last_layer - if not self.last_vfe: - out_channels = out_channels // 2 - self.units = out_channels - - self.norm = build_norm_layer(norm_cfg, self.units)[1] - self.linear = nn.Linear(in_channels, self.units, bias=False) - - assert mode in ['max', 'avg'] - self.mode = mode - - @auto_fp16(apply_to=('inputs'), out_fp32=True) - def forward(self, inputs, num_voxels=None, aligned_distance=None): - """Forward function. - - Args: - inputs (torch.Tensor): Pillar/Voxel inputs with shape (N, M, C). - N is the number of voxels, M is the number of points in - voxels, C is the number of channels of point features. - num_voxels (torch.Tensor, optional): Number of points in each - voxel. Defaults to None. - aligned_distance (torch.Tensor, optional): The distance of - each points to the voxel center. Defaults to None. - - Returns: - torch.Tensor: Features of Pillars. 
- """ - x = self.linear(inputs) - x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, - 1).contiguous() - x = F.relu(x) - - if self.mode == 'max': - if aligned_distance is not None: - x = x.mul(aligned_distance.unsqueeze(-1)) - x_max = torch.max(x, dim=1, keepdim=True)[0] - elif self.mode == 'avg': - if aligned_distance is not None: - x = x.mul(aligned_distance.unsqueeze(-1)) - x_max = x.sum( - dim=1, keepdim=True) / num_voxels.type_as(inputs).view( - -1, 1, 1) - - if self.last_vfe: - return x_max - else: - x_repeat = x_max.repeat(1, inputs.shape[1], 1) - x_concatenated = torch.cat([x, x_repeat], dim=2) - return x_concatenated +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import build_norm_layer +from mmcv.runner import auto_fp16 +from torch import nn +from torch.nn import functional as F + + +def get_paddings_indicator(actual_num, max_num, axis=0): + """Create boolean mask by actually number of a padded tensor. + + Args: + actual_num (torch.Tensor): Actual number of points in each voxel. + max_num (int): Max number of points in each voxel + + Returns: + torch.Tensor: Mask indicates which points are valid inside a voxel. + """ + actual_num = torch.unsqueeze(actual_num, axis + 1) + # tiled_actual_num: [N, M, 1] + max_num_shape = [1] * len(actual_num.shape) + max_num_shape[axis + 1] = -1 + max_num = torch.arange( + max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape) + # tiled_actual_num: [[3,3,3,3,3], [4,4,4,4,4], [2,2,2,2,2]] + # tiled_max_num: [[0,1,2,3,4], [0,1,2,3,4], [0,1,2,3,4]] + paddings_indicator = actual_num.int() > max_num + # paddings_indicator shape: [batch_size, max_num] + return paddings_indicator + + +class VFELayer(nn.Module): + """Voxel Feature Encoder layer. + + The voxel encoder is composed of a series of these layers. + This module do not support average pooling and only support to use + max pooling to gather features inside a VFE. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + norm_cfg (dict): Config dict of normalization layers + max_out (bool): Whether aggregate the features of points inside + each voxel and only return voxel features. + cat_max (bool): Whether concatenate the aggregated features + and pointwise features. + """ + + def __init__(self, + in_channels, + out_channels, + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + max_out=True, + cat_max=True): + super(VFELayer, self).__init__() + self.fp16_enabled = False + self.cat_max = cat_max + self.max_out = max_out + # self.units = int(out_channels / 2) + + self.norm = build_norm_layer(norm_cfg, out_channels)[1] + self.linear = nn.Linear(in_channels, out_channels, bias=False) + + @auto_fp16(apply_to=('inputs'), out_fp32=True) + def forward(self, inputs): + """Forward function. + + Args: + inputs (torch.Tensor): Voxels features of shape (N, M, C). + N is the number of voxels, M is the number of points in + voxels, C is the number of channels of point features. + + Returns: + torch.Tensor: Voxel features. There are three mode under which the + features have different meaning. + - `max_out=False`: Return point-wise features in + shape (N, M, C). + - `max_out=True` and `cat_max=False`: Return aggregated + voxel features in shape (N, C) + - `max_out=True` and `cat_max=True`: Return concatenated + point-wise features in shape (N, M, C). 
+ """ + # [K, T, 7] tensordot [7, units] = [K, T, units] + voxel_count = inputs.shape[1] + + x = self.linear(inputs) + x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, + 1).contiguous() + pointwise = F.relu(x) + # [K, T, units] + if self.max_out: + aggregated = torch.max(pointwise, dim=1, keepdim=True)[0] + else: + # this is for fusion layer + return pointwise + + if not self.cat_max: + return aggregated.squeeze(1) + else: + # [K, 1, units] + repeated = aggregated.repeat(1, voxel_count, 1) + concatenated = torch.cat([pointwise, repeated], dim=2) + # [K, T, 2 * units] + return concatenated + + +class PFNLayer(nn.Module): + """Pillar Feature Net Layer. + + The Pillar Feature Net is composed of a series of these layers, but the + PointPillars paper results only used a single PFNLayer. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + norm_cfg (dict, optional): Config dict of normalization layers. + Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). + last_layer (bool, optional): If last_layer, there is no + concatenation of features. Defaults to False. + mode (str, optional): Pooling model to gather features inside voxels. + Defaults to 'max'. + """ + + def __init__(self, + in_channels, + out_channels, + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + last_layer=False, + mode='max'): + + super().__init__() + self.fp16_enabled = False + self.name = 'PFNLayer' + self.last_vfe = last_layer + if not self.last_vfe: + out_channels = out_channels // 2 + self.units = out_channels + + self.norm = build_norm_layer(norm_cfg, self.units)[1] + self.linear = nn.Linear(in_channels, self.units, bias=False) + + assert mode in ['max', 'avg'] + self.mode = mode + + @auto_fp16(apply_to=('inputs'), out_fp32=True) + def forward(self, inputs, num_voxels=None, aligned_distance=None): + """Forward function. + + Args: + inputs (torch.Tensor): Pillar/Voxel inputs with shape (N, M, C). + N is the number of voxels, M is the number of points in + voxels, C is the number of channels of point features. + num_voxels (torch.Tensor, optional): Number of points in each + voxel. Defaults to None. + aligned_distance (torch.Tensor, optional): The distance of + each points to the voxel center. Defaults to None. + + Returns: + torch.Tensor: Features of Pillars. + """ + x = self.linear(inputs) + x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, + 1).contiguous() + x = F.relu(x) + + if self.mode == 'max': + if aligned_distance is not None: + x = x.mul(aligned_distance.unsqueeze(-1)) + x_max = torch.max(x, dim=1, keepdim=True)[0] + elif self.mode == 'avg': + if aligned_distance is not None: + x = x.mul(aligned_distance.unsqueeze(-1)) + x_max = x.sum( + dim=1, keepdim=True) / num_voxels.type_as(inputs).view( + -1, 1, 1) + + if self.last_vfe: + return x_max + else: + x_repeat = x_max.repeat(1, inputs.shape[1], 1) + x_concatenated = torch.cat([x, x_repeat], dim=2) + return x_concatenated diff --git a/mmdet3d/models/voxel_encoders/voxel_encoder.py b/mmdet3d/models/voxel_encoders/voxel_encoder.py index 9f3cf53..dbec200 100644 --- a/mmdet3d/models/voxel_encoders/voxel_encoder.py +++ b/mmdet3d/models/voxel_encoders/voxel_encoder.py @@ -1,489 +1,489 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import build_norm_layer -from mmcv.ops import DynamicScatter -from mmcv.runner import force_fp32 -from torch import nn - -from .. 
import builder -from ..builder import VOXEL_ENCODERS -from .utils import VFELayer, get_paddings_indicator - - -@VOXEL_ENCODERS.register_module() -class HardSimpleVFE(nn.Module): - """Simple voxel feature encoder used in SECOND. - - It simply averages the values of points in a voxel. - - Args: - num_features (int, optional): Number of features to use. Default: 4. - """ - - def __init__(self, num_features=4): - super(HardSimpleVFE, self).__init__() - self.num_features = num_features - self.fp16_enabled = False - - @force_fp32(out_fp16=True) - def forward(self, features, num_points, coors): - """Forward function. - - Args: - features (torch.Tensor): Point features in shape - (N, M, 3(4)). N is the number of voxels and M is the maximum - number of points inside a single voxel. - num_points (torch.Tensor): Number of points in each voxel, - shape (N, ). - coors (torch.Tensor): Coordinates of voxels. - - Returns: - torch.Tensor: Mean of points inside each voxel in shape (N, 3(4)) - """ - points_mean = features[:, :, :self.num_features].sum( - dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1) - return points_mean.contiguous() - - -@VOXEL_ENCODERS.register_module() -class DynamicSimpleVFE(nn.Module): - """Simple dynamic voxel feature encoder used in DV-SECOND. - - It simply averages the values of points in a voxel. - But the number of points in a voxel is dynamic and varies. - - Args: - voxel_size (tupe[float]): Size of a single voxel - point_cloud_range (tuple[float]): Range of the point cloud and voxels - """ - - def __init__(self, - voxel_size=(0.2, 0.2, 4), - point_cloud_range=(0, -40, -3, 70.4, 40, 1)): - super(DynamicSimpleVFE, self).__init__() - self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) - self.fp16_enabled = False - - @torch.no_grad() - @force_fp32(out_fp16=True) - def forward(self, features, coors): - """Forward function. - - Args: - features (torch.Tensor): Point features in shape - (N, 3(4)). N is the number of points. - coors (torch.Tensor): Coordinates of voxels. - - Returns: - torch.Tensor: Mean of points inside each voxel in shape (M, 3(4)). - M is the number of voxels. - """ - # This function is used from the start of the voxelnet - # num_points: [concated_num_points] - features, features_coors = self.scatter(features, coors) - return features, features_coors - - -@VOXEL_ENCODERS.register_module() -class DynamicVFE(nn.Module): - """Dynamic Voxel feature encoder used in DV-SECOND. - - It encodes features of voxels and their points. It could also fuse - image feature into voxel features in a point-wise manner. - The number of points inside the voxel varies. - - Args: - in_channels (int, optional): Input channels of VFE. Defaults to 4. - feat_channels (list(int), optional): Channels of features in VFE. - with_distance (bool, optional): Whether to use the L2 distance of - points to the origin point. Defaults to False. - with_cluster_center (bool, optional): Whether to use the distance - to cluster center of points inside a voxel. Defaults to False. - with_voxel_center (bool, optional): Whether to use the distance - to center of voxel for each points inside a voxel. - Defaults to False. - voxel_size (tuple[float], optional): Size of a single voxel. - Defaults to (0.2, 0.2, 4). - point_cloud_range (tuple[float], optional): The range of points - or voxels. Defaults to (0, -40, -3, 70.4, 40, 1). - norm_cfg (dict, optional): Config dict of normalization layers. - mode (str, optional): The mode when pooling features of points - inside a voxel. 
Available options include 'max' and 'avg'. - Defaults to 'max'. - fusion_layer (dict, optional): The config dict of fusion - layer used in multi-modal detectors. Defaults to None. - return_point_feats (bool, optional): Whether to return the features - of each points. Defaults to False. - """ - - def __init__(self, - in_channels=4, - feat_channels=[], - with_distance=False, - with_cluster_center=False, - with_voxel_center=False, - voxel_size=(0.2, 0.2, 4), - point_cloud_range=(0, -40, -3, 70.4, 40, 1), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - mode='max', - fusion_layer=None, - return_point_feats=False): - super(DynamicVFE, self).__init__() - assert mode in ['avg', 'max'] - assert len(feat_channels) > 0 - if with_cluster_center: - in_channels += 3 - if with_voxel_center: - in_channels += 3 - if with_distance: - in_channels += 1 - self.in_channels = in_channels - self._with_distance = with_distance - self._with_cluster_center = with_cluster_center - self._with_voxel_center = with_voxel_center - self.return_point_feats = return_point_feats - self.fp16_enabled = False - - # Need pillar (voxel) size and x/y offset in order to calculate offset - self.vx = voxel_size[0] - self.vy = voxel_size[1] - self.vz = voxel_size[2] - self.x_offset = self.vx / 2 + point_cloud_range[0] - self.y_offset = self.vy / 2 + point_cloud_range[1] - self.z_offset = self.vz / 2 + point_cloud_range[2] - self.point_cloud_range = point_cloud_range - self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) - - feat_channels = [self.in_channels] + list(feat_channels) - vfe_layers = [] - for i in range(len(feat_channels) - 1): - in_filters = feat_channels[i] - out_filters = feat_channels[i + 1] - if i > 0: - in_filters *= 2 - norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) - vfe_layers.append( - nn.Sequential( - nn.Linear(in_filters, out_filters, bias=False), norm_layer, - nn.ReLU(inplace=True))) - self.vfe_layers = nn.ModuleList(vfe_layers) - self.num_vfe = len(vfe_layers) - self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range, - (mode != 'max')) - self.cluster_scatter = DynamicScatter( - voxel_size, point_cloud_range, average_points=True) - self.fusion_layer = None - if fusion_layer is not None: - self.fusion_layer = builder.build_fusion_layer(fusion_layer) - - def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): - """Map voxel features to its corresponding points. - - Args: - pts_coors (torch.Tensor): Voxel coordinate of each point. - voxel_mean (torch.Tensor): Voxel features to be mapped. - voxel_coors (torch.Tensor): Coordinates of valid voxels - - Returns: - torch.Tensor: Features or centers of each point. 
- """ - # Step 1: scatter voxel into canvas - # Calculate necessary things for canvas creation - canvas_z = int( - (self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz) - canvas_y = int( - (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) - canvas_x = int( - (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) - # canvas_channel = voxel_mean.size(1) - batch_size = pts_coors[-1, 0] + 1 - canvas_len = canvas_z * canvas_y * canvas_x * batch_size - # Create the canvas for this sample - canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long) - # Only include non-empty pillars - indices = ( - voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x + - voxel_coors[:, 1] * canvas_y * canvas_x + - voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) - # Scatter the blob back to the canvas - canvas[indices.long()] = torch.arange( - start=0, end=voxel_mean.size(0), device=voxel_mean.device) - - # Step 2: get voxel mean for each point - voxel_index = ( - pts_coors[:, 0] * canvas_z * canvas_y * canvas_x + - pts_coors[:, 1] * canvas_y * canvas_x + - pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) - voxel_inds = canvas[voxel_index.long()] - center_per_point = voxel_mean[voxel_inds, ...] - return center_per_point - - @force_fp32(out_fp16=True) - def forward(self, - features, - coors, - points=None, - img_feats=None, - img_metas=None): - """Forward functions. - - Args: - features (torch.Tensor): Features of voxels, shape is NxC. - coors (torch.Tensor): Coordinates of voxels, shape is Nx(1+NDim). - points (list[torch.Tensor], optional): Raw points used to guide the - multi-modality fusion. Defaults to None. - img_feats (list[torch.Tensor], optional): Image features used for - multi-modality fusion. Defaults to None. - img_metas (dict, optional): [description]. Defaults to None. - - Returns: - tuple: If `return_point_feats` is False, returns voxel features and - its coordinates. If `return_point_feats` is True, returns - feature of each points inside voxels. 
- """ - features_ls = [features] - # Find distance of x, y, and z from cluster center - if self._with_cluster_center: - voxel_mean, mean_coors = self.cluster_scatter(features, coors) - points_mean = self.map_voxel_center_to_point( - coors, voxel_mean, mean_coors) - # TODO: maybe also do cluster for reflectivity - f_cluster = features[:, :3] - points_mean[:, :3] - features_ls.append(f_cluster) - - # Find distance of x, y, and z from pillar center - if self._with_voxel_center: - f_center = features.new_zeros(size=(features.size(0), 3)) - f_center[:, 0] = features[:, 0] - ( - coors[:, 3].type_as(features) * self.vx + self.x_offset) - f_center[:, 1] = features[:, 1] - ( - coors[:, 2].type_as(features) * self.vy + self.y_offset) - f_center[:, 2] = features[:, 2] - ( - coors[:, 1].type_as(features) * self.vz + self.z_offset) - features_ls.append(f_center) - - if self._with_distance: - points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) - features_ls.append(points_dist) - - # Combine together feature decorations - features = torch.cat(features_ls, dim=-1) - for i, vfe in enumerate(self.vfe_layers): - point_feats = vfe(features) - if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None - and img_feats is not None): - point_feats = self.fusion_layer(img_feats, points, point_feats, - img_metas) - voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors) - if i != len(self.vfe_layers) - 1: - # need to concat voxel feats if it is not the last vfe - feat_per_point = self.map_voxel_center_to_point( - coors, voxel_feats, voxel_coors) - features = torch.cat([point_feats, feat_per_point], dim=1) - - if self.return_point_feats: - return point_feats - return voxel_feats, voxel_coors - - -@VOXEL_ENCODERS.register_module() -class HardVFE(nn.Module): - """Voxel feature encoder used in DV-SECOND. - - It encodes features of voxels and their points. It could also fuse - image feature into voxel features in a point-wise manner. - - Args: - in_channels (int, optional): Input channels of VFE. Defaults to 4. - feat_channels (list(int), optional): Channels of features in VFE. - with_distance (bool, optional): Whether to use the L2 distance - of points to the origin point. Defaults to False. - with_cluster_center (bool, optional): Whether to use the distance - to cluster center of points inside a voxel. Defaults to False. - with_voxel_center (bool, optional): Whether to use the distance to - center of voxel for each points inside a voxel. Defaults to False. - voxel_size (tuple[float], optional): Size of a single voxel. - Defaults to (0.2, 0.2, 4). - point_cloud_range (tuple[float], optional): The range of points - or voxels. Defaults to (0, -40, -3, 70.4, 40, 1). - norm_cfg (dict, optional): Config dict of normalization layers. - mode (str, optional): The mode when pooling features of points inside a - voxel. Available options include 'max' and 'avg'. - Defaults to 'max'. - fusion_layer (dict, optional): The config dict of fusion layer - used in multi-modal detectors. Defaults to None. - return_point_feats (bool, optional): Whether to return the - features of each points. Defaults to False. 
- """ - - def __init__(self, - in_channels=4, - feat_channels=[], - with_distance=False, - with_cluster_center=False, - with_voxel_center=False, - voxel_size=(0.2, 0.2, 4), - point_cloud_range=(0, -40, -3, 70.4, 40, 1), - norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), - mode='max', - fusion_layer=None, - return_point_feats=False): - super(HardVFE, self).__init__() - assert len(feat_channels) > 0 - if with_cluster_center: - in_channels += 3 - if with_voxel_center: - in_channels += 3 - if with_distance: - in_channels += 1 - self.in_channels = in_channels - self._with_distance = with_distance - self._with_cluster_center = with_cluster_center - self._with_voxel_center = with_voxel_center - self.return_point_feats = return_point_feats - self.fp16_enabled = False - - # Need pillar (voxel) size and x/y offset to calculate pillar offset - self.vx = voxel_size[0] - self.vy = voxel_size[1] - self.vz = voxel_size[2] - self.x_offset = self.vx / 2 + point_cloud_range[0] - self.y_offset = self.vy / 2 + point_cloud_range[1] - self.z_offset = self.vz / 2 + point_cloud_range[2] - self.point_cloud_range = point_cloud_range - self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) - - feat_channels = [self.in_channels] + list(feat_channels) - vfe_layers = [] - for i in range(len(feat_channels) - 1): - in_filters = feat_channels[i] - out_filters = feat_channels[i + 1] - if i > 0: - in_filters *= 2 - # TODO: pass norm_cfg to VFE - # norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) - if i == (len(feat_channels) - 2): - cat_max = False - max_out = True - if fusion_layer: - max_out = False - else: - max_out = True - cat_max = True - vfe_layers.append( - VFELayer( - in_filters, - out_filters, - norm_cfg=norm_cfg, - max_out=max_out, - cat_max=cat_max)) - self.vfe_layers = nn.ModuleList(vfe_layers) - self.num_vfe = len(vfe_layers) - - self.fusion_layer = None - if fusion_layer is not None: - self.fusion_layer = builder.build_fusion_layer(fusion_layer) - - @force_fp32(out_fp16=True) - def forward(self, - features, - num_points, - coors, - img_feats=None, - img_metas=None): - """Forward functions. - - Args: - features (torch.Tensor): Features of voxels, shape is MxNxC. - num_points (torch.Tensor): Number of points in each voxel. - coors (torch.Tensor): Coordinates of voxels, shape is Mx(1+NDim). - img_feats (list[torch.Tensor], optional): Image features used for - multi-modality fusion. Defaults to None. - img_metas (dict, optional): [description]. Defaults to None. - - Returns: - tuple: If `return_point_feats` is False, returns voxel features and - its coordinates. If `return_point_feats` is True, returns - feature of each points inside voxels. 
- """ - features_ls = [features] - # Find distance of x, y, and z from cluster center - if self._with_cluster_center: - points_mean = ( - features[:, :, :3].sum(dim=1, keepdim=True) / - num_points.type_as(features).view(-1, 1, 1)) - # TODO: maybe also do cluster for reflectivity - f_cluster = features[:, :, :3] - points_mean - features_ls.append(f_cluster) - - # Find distance of x, y, and z from pillar center - if self._with_voxel_center: - f_center = features.new_zeros( - size=(features.size(0), features.size(1), 3)) - f_center[:, :, 0] = features[:, :, 0] - ( - coors[:, 3].type_as(features).unsqueeze(1) * self.vx + - self.x_offset) - f_center[:, :, 1] = features[:, :, 1] - ( - coors[:, 2].type_as(features).unsqueeze(1) * self.vy + - self.y_offset) - f_center[:, :, 2] = features[:, :, 2] - ( - coors[:, 1].type_as(features).unsqueeze(1) * self.vz + - self.z_offset) - features_ls.append(f_center) - - if self._with_distance: - points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) - features_ls.append(points_dist) - - # Combine together feature decorations - voxel_feats = torch.cat(features_ls, dim=-1) - # The feature decorations were calculated without regard to whether - # pillar was empty. - # Need to ensure that empty voxels remain set to zeros. - voxel_count = voxel_feats.shape[1] - mask = get_paddings_indicator(num_points, voxel_count, axis=0) - voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats) - - for i, vfe in enumerate(self.vfe_layers): - voxel_feats = vfe(voxel_feats) - - if (self.fusion_layer is not None and img_feats is not None): - voxel_feats = self.fusion_with_mask(features, mask, voxel_feats, - coors, img_feats, img_metas) - - return voxel_feats - - def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats, - img_metas): - """Fuse image and point features with mask. - - Args: - features (torch.Tensor): Features of voxel, usually it is the - values of points in voxels. - mask (torch.Tensor): Mask indicates valid features in each voxel. - voxel_feats (torch.Tensor): Features of voxels. - coors (torch.Tensor): Coordinates of each single voxel. - img_feats (list[torch.Tensor]): Multi-scale feature maps of image. - img_metas (list(dict)): Meta information of image and points. - - Returns: - torch.Tensor: Fused features of each voxel. - """ - # the features is consist of a batch of points - batch_size = coors[-1, 0] + 1 - points = [] - for i in range(batch_size): - single_mask = (coors[:, 0] == i) - points.append(features[single_mask][mask[single_mask]]) - - point_feats = voxel_feats[mask] - point_feats = self.fusion_layer(img_feats, points, point_feats, - img_metas) - - voxel_canvas = voxel_feats.new_zeros( - size=(voxel_feats.size(0), voxel_feats.size(1), - point_feats.size(-1))) - voxel_canvas[mask] = point_feats - out = torch.max(voxel_canvas, dim=1)[0] - - return out +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import build_norm_layer +from mmcv.ops import DynamicScatter +from mmcv.runner import force_fp32 +from torch import nn + +from .. import builder +from ..builder import VOXEL_ENCODERS +from .utils import VFELayer, get_paddings_indicator + + +@VOXEL_ENCODERS.register_module() +class HardSimpleVFE(nn.Module): + """Simple voxel feature encoder used in SECOND. + + It simply averages the values of points in a voxel. + + Args: + num_features (int, optional): Number of features to use. Default: 4. 
+ """ + + def __init__(self, num_features=4): + super(HardSimpleVFE, self).__init__() + self.num_features = num_features + self.fp16_enabled = False + + @force_fp32(out_fp16=True) + def forward(self, features, num_points, coors): + """Forward function. + + Args: + features (torch.Tensor): Point features in shape + (N, M, 3(4)). N is the number of voxels and M is the maximum + number of points inside a single voxel. + num_points (torch.Tensor): Number of points in each voxel, + shape (N, ). + coors (torch.Tensor): Coordinates of voxels. + + Returns: + torch.Tensor: Mean of points inside each voxel in shape (N, 3(4)) + """ + points_mean = features[:, :, :self.num_features].sum( + dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1) + return points_mean.contiguous() + + +@VOXEL_ENCODERS.register_module() +class DynamicSimpleVFE(nn.Module): + """Simple dynamic voxel feature encoder used in DV-SECOND. + + It simply averages the values of points in a voxel. + But the number of points in a voxel is dynamic and varies. + + Args: + voxel_size (tupe[float]): Size of a single voxel + point_cloud_range (tuple[float]): Range of the point cloud and voxels + """ + + def __init__(self, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1)): + super(DynamicSimpleVFE, self).__init__() + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + self.fp16_enabled = False + + @torch.no_grad() + @force_fp32(out_fp16=True) + def forward(self, features, coors): + """Forward function. + + Args: + features (torch.Tensor): Point features in shape + (N, 3(4)). N is the number of points. + coors (torch.Tensor): Coordinates of voxels. + + Returns: + torch.Tensor: Mean of points inside each voxel in shape (M, 3(4)). + M is the number of voxels. + """ + # This function is used from the start of the voxelnet + # num_points: [concated_num_points] + features, features_coors = self.scatter(features, coors) + return features, features_coors + + +@VOXEL_ENCODERS.register_module() +class DynamicVFE(nn.Module): + """Dynamic Voxel feature encoder used in DV-SECOND. + + It encodes features of voxels and their points. It could also fuse + image feature into voxel features in a point-wise manner. + The number of points inside the voxel varies. + + Args: + in_channels (int, optional): Input channels of VFE. Defaults to 4. + feat_channels (list(int), optional): Channels of features in VFE. + with_distance (bool, optional): Whether to use the L2 distance of + points to the origin point. Defaults to False. + with_cluster_center (bool, optional): Whether to use the distance + to cluster center of points inside a voxel. Defaults to False. + with_voxel_center (bool, optional): Whether to use the distance + to center of voxel for each points inside a voxel. + Defaults to False. + voxel_size (tuple[float], optional): Size of a single voxel. + Defaults to (0.2, 0.2, 4). + point_cloud_range (tuple[float], optional): The range of points + or voxels. Defaults to (0, -40, -3, 70.4, 40, 1). + norm_cfg (dict, optional): Config dict of normalization layers. + mode (str, optional): The mode when pooling features of points + inside a voxel. Available options include 'max' and 'avg'. + Defaults to 'max'. + fusion_layer (dict, optional): The config dict of fusion + layer used in multi-modal detectors. Defaults to None. + return_point_feats (bool, optional): Whether to return the features + of each points. Defaults to False. 
+ """ + + def __init__(self, + in_channels=4, + feat_channels=[], + with_distance=False, + with_cluster_center=False, + with_voxel_center=False, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max', + fusion_layer=None, + return_point_feats=False): + super(DynamicVFE, self).__init__() + assert mode in ['avg', 'max'] + assert len(feat_channels) > 0 + if with_cluster_center: + in_channels += 3 + if with_voxel_center: + in_channels += 3 + if with_distance: + in_channels += 1 + self.in_channels = in_channels + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + self.return_point_feats = return_point_feats + self.fp16_enabled = False + + # Need pillar (voxel) size and x/y offset in order to calculate offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + self.point_cloud_range = point_cloud_range + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + + feat_channels = [self.in_channels] + list(feat_channels) + vfe_layers = [] + for i in range(len(feat_channels) - 1): + in_filters = feat_channels[i] + out_filters = feat_channels[i + 1] + if i > 0: + in_filters *= 2 + norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + vfe_layers.append( + nn.Sequential( + nn.Linear(in_filters, out_filters, bias=False), norm_layer, + nn.ReLU(inplace=True))) + self.vfe_layers = nn.ModuleList(vfe_layers) + self.num_vfe = len(vfe_layers) + self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range, + (mode != 'max')) + self.cluster_scatter = DynamicScatter( + voxel_size, point_cloud_range, average_points=True) + self.fusion_layer = None + if fusion_layer is not None: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + + def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): + """Map voxel features to its corresponding points. + + Args: + pts_coors (torch.Tensor): Voxel coordinate of each point. + voxel_mean (torch.Tensor): Voxel features to be mapped. + voxel_coors (torch.Tensor): Coordinates of valid voxels + + Returns: + torch.Tensor: Features or centers of each point. 
+ """ + # Step 1: scatter voxel into canvas + # Calculate necessary things for canvas creation + canvas_z = int( + (self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz) + canvas_y = int( + (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) + canvas_x = int( + (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) + # canvas_channel = voxel_mean.size(1) + batch_size = pts_coors[-1, 0] + 1 + canvas_len = canvas_z * canvas_y * canvas_x * batch_size + # Create the canvas for this sample + canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long) + # Only include non-empty pillars + indices = ( + voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x + + voxel_coors[:, 1] * canvas_y * canvas_x + + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) + # Scatter the blob back to the canvas + canvas[indices.long()] = torch.arange( + start=0, end=voxel_mean.size(0), device=voxel_mean.device) + + # Step 2: get voxel mean for each point + voxel_index = ( + pts_coors[:, 0] * canvas_z * canvas_y * canvas_x + + pts_coors[:, 1] * canvas_y * canvas_x + + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) + voxel_inds = canvas[voxel_index.long()] + center_per_point = voxel_mean[voxel_inds, ...] + return center_per_point + + @force_fp32(out_fp16=True) + def forward(self, + features, + coors, + points=None, + img_feats=None, + img_metas=None): + """Forward functions. + + Args: + features (torch.Tensor): Features of voxels, shape is NxC. + coors (torch.Tensor): Coordinates of voxels, shape is Nx(1+NDim). + points (list[torch.Tensor], optional): Raw points used to guide the + multi-modality fusion. Defaults to None. + img_feats (list[torch.Tensor], optional): Image features used for + multi-modality fusion. Defaults to None. + img_metas (dict, optional): [description]. Defaults to None. + + Returns: + tuple: If `return_point_feats` is False, returns voxel features and + its coordinates. If `return_point_feats` is True, returns + feature of each points inside voxels. 
+ """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + voxel_mean, mean_coors = self.cluster_scatter(features, coors) + points_mean = self.map_voxel_center_to_point( + coors, voxel_mean, mean_coors) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :3] - points_mean[:, :3] + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros(size=(features.size(0), 3)) + f_center[:, 0] = features[:, 0] - ( + coors[:, 3].type_as(features) * self.vx + self.x_offset) + f_center[:, 1] = features[:, 1] - ( + coors[:, 2].type_as(features) * self.vy + self.y_offset) + f_center[:, 2] = features[:, 2] - ( + coors[:, 1].type_as(features) * self.vz + self.z_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + features = torch.cat(features_ls, dim=-1) + for i, vfe in enumerate(self.vfe_layers): + point_feats = vfe(features) + if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None + and img_feats is not None): + point_feats = self.fusion_layer(img_feats, points, point_feats, + img_metas) + voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors) + if i != len(self.vfe_layers) - 1: + # need to concat voxel feats if it is not the last vfe + feat_per_point = self.map_voxel_center_to_point( + coors, voxel_feats, voxel_coors) + features = torch.cat([point_feats, feat_per_point], dim=1) + + if self.return_point_feats: + return point_feats + return voxel_feats, voxel_coors + + +@VOXEL_ENCODERS.register_module() +class HardVFE(nn.Module): + """Voxel feature encoder used in DV-SECOND. + + It encodes features of voxels and their points. It could also fuse + image feature into voxel features in a point-wise manner. + + Args: + in_channels (int, optional): Input channels of VFE. Defaults to 4. + feat_channels (list(int), optional): Channels of features in VFE. + with_distance (bool, optional): Whether to use the L2 distance + of points to the origin point. Defaults to False. + with_cluster_center (bool, optional): Whether to use the distance + to cluster center of points inside a voxel. Defaults to False. + with_voxel_center (bool, optional): Whether to use the distance to + center of voxel for each points inside a voxel. Defaults to False. + voxel_size (tuple[float], optional): Size of a single voxel. + Defaults to (0.2, 0.2, 4). + point_cloud_range (tuple[float], optional): The range of points + or voxels. Defaults to (0, -40, -3, 70.4, 40, 1). + norm_cfg (dict, optional): Config dict of normalization layers. + mode (str, optional): The mode when pooling features of points inside a + voxel. Available options include 'max' and 'avg'. + Defaults to 'max'. + fusion_layer (dict, optional): The config dict of fusion layer + used in multi-modal detectors. Defaults to None. + return_point_feats (bool, optional): Whether to return the + features of each points. Defaults to False. 
+ """ + + def __init__(self, + in_channels=4, + feat_channels=[], + with_distance=False, + with_cluster_center=False, + with_voxel_center=False, + voxel_size=(0.2, 0.2, 4), + point_cloud_range=(0, -40, -3, 70.4, 40, 1), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + mode='max', + fusion_layer=None, + return_point_feats=False): + super(HardVFE, self).__init__() + assert len(feat_channels) > 0 + if with_cluster_center: + in_channels += 3 + if with_voxel_center: + in_channels += 3 + if with_distance: + in_channels += 1 + self.in_channels = in_channels + self._with_distance = with_distance + self._with_cluster_center = with_cluster_center + self._with_voxel_center = with_voxel_center + self.return_point_feats = return_point_feats + self.fp16_enabled = False + + # Need pillar (voxel) size and x/y offset to calculate pillar offset + self.vx = voxel_size[0] + self.vy = voxel_size[1] + self.vz = voxel_size[2] + self.x_offset = self.vx / 2 + point_cloud_range[0] + self.y_offset = self.vy / 2 + point_cloud_range[1] + self.z_offset = self.vz / 2 + point_cloud_range[2] + self.point_cloud_range = point_cloud_range + self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) + + feat_channels = [self.in_channels] + list(feat_channels) + vfe_layers = [] + for i in range(len(feat_channels) - 1): + in_filters = feat_channels[i] + out_filters = feat_channels[i + 1] + if i > 0: + in_filters *= 2 + # TODO: pass norm_cfg to VFE + # norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) + if i == (len(feat_channels) - 2): + cat_max = False + max_out = True + if fusion_layer: + max_out = False + else: + max_out = True + cat_max = True + vfe_layers.append( + VFELayer( + in_filters, + out_filters, + norm_cfg=norm_cfg, + max_out=max_out, + cat_max=cat_max)) + self.vfe_layers = nn.ModuleList(vfe_layers) + self.num_vfe = len(vfe_layers) + + self.fusion_layer = None + if fusion_layer is not None: + self.fusion_layer = builder.build_fusion_layer(fusion_layer) + + @force_fp32(out_fp16=True) + def forward(self, + features, + num_points, + coors, + img_feats=None, + img_metas=None): + """Forward functions. + + Args: + features (torch.Tensor): Features of voxels, shape is MxNxC. + num_points (torch.Tensor): Number of points in each voxel. + coors (torch.Tensor): Coordinates of voxels, shape is Mx(1+NDim). + img_feats (list[torch.Tensor], optional): Image features used for + multi-modality fusion. Defaults to None. + img_metas (dict, optional): [description]. Defaults to None. + + Returns: + tuple: If `return_point_feats` is False, returns voxel features and + its coordinates. If `return_point_feats` is True, returns + feature of each points inside voxels. 
+ """ + features_ls = [features] + # Find distance of x, y, and z from cluster center + if self._with_cluster_center: + points_mean = ( + features[:, :, :3].sum(dim=1, keepdim=True) / + num_points.type_as(features).view(-1, 1, 1)) + # TODO: maybe also do cluster for reflectivity + f_cluster = features[:, :, :3] - points_mean + features_ls.append(f_cluster) + + # Find distance of x, y, and z from pillar center + if self._with_voxel_center: + f_center = features.new_zeros( + size=(features.size(0), features.size(1), 3)) + f_center[:, :, 0] = features[:, :, 0] - ( + coors[:, 3].type_as(features).unsqueeze(1) * self.vx + + self.x_offset) + f_center[:, :, 1] = features[:, :, 1] - ( + coors[:, 2].type_as(features).unsqueeze(1) * self.vy + + self.y_offset) + f_center[:, :, 2] = features[:, :, 2] - ( + coors[:, 1].type_as(features).unsqueeze(1) * self.vz + + self.z_offset) + features_ls.append(f_center) + + if self._with_distance: + points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) + features_ls.append(points_dist) + + # Combine together feature decorations + voxel_feats = torch.cat(features_ls, dim=-1) + # The feature decorations were calculated without regard to whether + # pillar was empty. + # Need to ensure that empty voxels remain set to zeros. + voxel_count = voxel_feats.shape[1] + mask = get_paddings_indicator(num_points, voxel_count, axis=0) + voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats) + + for i, vfe in enumerate(self.vfe_layers): + voxel_feats = vfe(voxel_feats) + + if (self.fusion_layer is not None and img_feats is not None): + voxel_feats = self.fusion_with_mask(features, mask, voxel_feats, + coors, img_feats, img_metas) + + return voxel_feats + + def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats, + img_metas): + """Fuse image and point features with mask. + + Args: + features (torch.Tensor): Features of voxel, usually it is the + values of points in voxels. + mask (torch.Tensor): Mask indicates valid features in each voxel. + voxel_feats (torch.Tensor): Features of voxels. + coors (torch.Tensor): Coordinates of each single voxel. + img_feats (list[torch.Tensor]): Multi-scale feature maps of image. + img_metas (list(dict)): Meta information of image and points. + + Returns: + torch.Tensor: Fused features of each voxel. + """ + # the features is consist of a batch of points + batch_size = coors[-1, 0] + 1 + points = [] + for i in range(batch_size): + single_mask = (coors[:, 0] == i) + points.append(features[single_mask][mask[single_mask]]) + + point_feats = voxel_feats[mask] + point_feats = self.fusion_layer(img_feats, points, point_feats, + img_metas) + + voxel_canvas = voxel_feats.new_zeros( + size=(voxel_feats.size(0), voxel_feats.size(1), + point_feats.size(-1))) + voxel_canvas[mask] = point_feats + out = torch.max(voxel_canvas, dim=1)[0] + + return out diff --git a/mmdet3d/ops/__init__.py b/mmdet3d/ops/__init__.py index 88fb74d..8e9fd36 100644 --- a/mmdet3d/ops/__init__.py +++ b/mmdet3d/ops/__init__.py @@ -1,48 +1,48 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from mmcv.ops import (RoIAlign, SigmoidFocalLoss, get_compiler_version, - get_compiling_cuda_version, nms, roi_align, - sigmoid_focal_loss) -from mmcv.ops.assign_score_withk import assign_score_withk -from mmcv.ops.ball_query import ball_query -from mmcv.ops.furthest_point_sample import (furthest_point_sample, - furthest_point_sample_with_dist) -from mmcv.ops.gather_points import gather_points -from mmcv.ops.group_points import GroupAll, QueryAndGroup, grouping_operation -from mmcv.ops.knn import knn -from mmcv.ops.points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, - points_in_boxes_part) -from mmcv.ops.points_sampler import PointsSampler as Points_Sampler -from mmcv.ops.roiaware_pool3d import RoIAwarePool3d -from mmcv.ops.roipoint_pool3d import RoIPointPool3d -from mmcv.ops.scatter_points import DynamicScatter, dynamic_scatter -from mmcv.ops.three_interpolate import three_interpolate -from mmcv.ops.three_nn import three_nn -from mmcv.ops.voxelize import Voxelization, voxelization - -from .dgcnn_modules import DGCNNFAModule, DGCNNFPModule, DGCNNGFModule -from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d -from .paconv import PAConv, PAConvCUDA -from .pointnet_modules import (PAConvCUDASAModule, PAConvCUDASAModuleMSG, - PAConvSAModule, PAConvSAModuleMSG, - PointFPModule, PointSAModule, PointSAModuleMSG, - build_sa_module) -from .sparse_block import (SparseBasicBlock, SparseBottleneck, - make_sparse_convmodule) - -__all__ = [ - 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'get_compiler_version', - 'get_compiling_cuda_version', 'NaiveSyncBatchNorm1d', - 'NaiveSyncBatchNorm2d', 'batched_nms', 'Voxelization', 'voxelization', - 'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss', - 'SigmoidFocalLoss', 'SparseBasicBlock', 'SparseBottleneck', - 'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu', - 'make_sparse_convmodule', 'ball_query', 'knn', 'furthest_point_sample', - 'furthest_point_sample_with_dist', 'three_interpolate', 'three_nn', - 'gather_points', 'grouping_operation', 'GroupAll', 'QueryAndGroup', - 'PointSAModule', 'PointSAModuleMSG', 'PointFPModule', 'DGCNNFPModule', - 'DGCNNGFModule', 'DGCNNFAModule', 'points_in_boxes_all', - 'get_compiler_version', 'assign_score_withk', 'get_compiling_cuda_version', - 'Points_Sampler', 'build_sa_module', 'PAConv', 'PAConvCUDA', - 'PAConvSAModuleMSG', 'PAConvSAModule', 'PAConvCUDASAModule', - 'PAConvCUDASAModuleMSG', 'RoIPointPool3d' -] +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmcv.ops import (RoIAlign, SigmoidFocalLoss, get_compiler_version, + get_compiling_cuda_version, nms, roi_align, + sigmoid_focal_loss) +from mmcv.ops.assign_score_withk import assign_score_withk +from mmcv.ops.ball_query import ball_query +from mmcv.ops.furthest_point_sample import (furthest_point_sample, + furthest_point_sample_with_dist) +from mmcv.ops.gather_points import gather_points +from mmcv.ops.group_points import GroupAll, QueryAndGroup, grouping_operation +from mmcv.ops.knn import knn +from mmcv.ops.points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, + points_in_boxes_part) +from mmcv.ops.points_sampler import PointsSampler as Points_Sampler +from mmcv.ops.roiaware_pool3d import RoIAwarePool3d +from mmcv.ops.roipoint_pool3d import RoIPointPool3d +from mmcv.ops.scatter_points import DynamicScatter, dynamic_scatter +from mmcv.ops.three_interpolate import three_interpolate +from mmcv.ops.three_nn import three_nn +from mmcv.ops.voxelize import Voxelization, voxelization + +from .dgcnn_modules import DGCNNFAModule, DGCNNFPModule, DGCNNGFModule +from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d +from .paconv import PAConv, PAConvCUDA +from .pointnet_modules import (PAConvCUDASAModule, PAConvCUDASAModuleMSG, + PAConvSAModule, PAConvSAModuleMSG, + PointFPModule, PointSAModule, PointSAModuleMSG, + build_sa_module) +from .sparse_block import (SparseBasicBlock, SparseBottleneck, + make_sparse_convmodule) + +__all__ = [ + 'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'get_compiler_version', + 'get_compiling_cuda_version', 'NaiveSyncBatchNorm1d', + 'NaiveSyncBatchNorm2d', 'batched_nms', 'Voxelization', 'voxelization', + 'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss', + 'SigmoidFocalLoss', 'SparseBasicBlock', 'SparseBottleneck', + 'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu', + 'make_sparse_convmodule', 'ball_query', 'knn', 'furthest_point_sample', + 'furthest_point_sample_with_dist', 'three_interpolate', 'three_nn', + 'gather_points', 'grouping_operation', 'GroupAll', 'QueryAndGroup', + 'PointSAModule', 'PointSAModuleMSG', 'PointFPModule', 'DGCNNFPModule', + 'DGCNNGFModule', 'DGCNNFAModule', 'points_in_boxes_all', + 'get_compiler_version', 'assign_score_withk', 'get_compiling_cuda_version', + 'Points_Sampler', 'build_sa_module', 'PAConv', 'PAConvCUDA', + 'PAConvSAModuleMSG', 'PAConvSAModule', 'PAConvCUDASAModule', + 'PAConvCUDASAModuleMSG', 'RoIPointPool3d' +] diff --git a/mmdet3d/ops/dgcnn_modules/__init__.py b/mmdet3d/ops/dgcnn_modules/__init__.py index 67beb09..5fce4a7 100644 --- a/mmdet3d/ops/dgcnn_modules/__init__.py +++ b/mmdet3d/ops/dgcnn_modules/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .dgcnn_fa_module import DGCNNFAModule -from .dgcnn_fp_module import DGCNNFPModule -from .dgcnn_gf_module import DGCNNGFModule - -__all__ = ['DGCNNFAModule', 'DGCNNFPModule', 'DGCNNGFModule'] +# Copyright (c) OpenMMLab. All rights reserved. +from .dgcnn_fa_module import DGCNNFAModule +from .dgcnn_fp_module import DGCNNFPModule +from .dgcnn_gf_module import DGCNNGFModule + +__all__ = ['DGCNNFAModule', 'DGCNNFPModule', 'DGCNNGFModule'] diff --git a/mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py b/mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py index b0975e6..1e2c6eb 100644 --- a/mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py +++ b/mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py @@ -1,68 +1,68 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import torch -from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule, force_fp32 -from torch import nn as nn - - -class DGCNNFAModule(BaseModule): - """Point feature aggregation module used in DGCNN. - - Aggregate all the features of points. - - Args: - mlp_channels (list[int]): List of mlp channels. - norm_cfg (dict, optional): Type of normalization method. - Defaults to dict(type='BN1d'). - act_cfg (dict, optional): Type of activation method. - Defaults to dict(type='ReLU'). - init_cfg (dict, optional): Initialization config. Defaults to None. - """ - - def __init__(self, - mlp_channels, - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.fp16_enabled = False - self.mlps = nn.Sequential() - for i in range(len(mlp_channels) - 1): - self.mlps.add_module( - f'layer{i}', - ConvModule( - mlp_channels[i], - mlp_channels[i + 1], - kernel_size=(1, ), - stride=(1, ), - conv_cfg=dict(type='Conv1d'), - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - @force_fp32() - def forward(self, points): - """forward. - - Args: - points (List[Tensor]): tensor of the features to be aggregated. - - Returns: - Tensor: (B, N, M) M = mlp[-1], tensor of the output points. - """ - - if len(points) > 1: - new_points = torch.cat(points[1:], dim=-1) - new_points = new_points.transpose(1, 2).contiguous() # (B, C, N) - new_points_copy = new_points - - new_points = self.mlps(new_points) - - new_fa_points = new_points.max(dim=-1, keepdim=True)[0] - new_fa_points = new_fa_points.repeat(1, 1, new_points.shape[-1]) - - new_points = torch.cat([new_fa_points, new_points_copy], dim=1) - new_points = new_points.transpose(1, 2).contiguous() - else: - new_points = points - - return new_points +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn + + +class DGCNNFAModule(BaseModule): + """Point feature aggregation module used in DGCNN. + + Aggregate all the features of points. + + Args: + mlp_channels (list[int]): List of mlp channels. + norm_cfg (dict, optional): Type of normalization method. + Defaults to dict(type='BN1d'). + act_cfg (dict, optional): Type of activation method. + Defaults to dict(type='ReLU'). + init_cfg (dict, optional): Initialization config. Defaults to None. + """ + + def __init__(self, + mlp_channels, + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.fp16_enabled = False + self.mlps = nn.Sequential() + for i in range(len(mlp_channels) - 1): + self.mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, ), + stride=(1, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + @force_fp32() + def forward(self, points): + """forward. + + Args: + points (List[Tensor]): tensor of the features to be aggregated. + + Returns: + Tensor: (B, N, M) M = mlp[-1], tensor of the output points. 
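+
+        Example:
+            An illustrative sketch; the channel sizes and shapes below are
+            assumed only for demonstration. Only `points[1:]` are
+            concatenated and aggregated, so the output channel equals
+            mlp_channels[-1] plus the concatenated input channels.
+
+            >>> fa = DGCNNFAModule(mlp_channels=[64, 128])
+            >>> points = [torch.rand(2, 256, 3), torch.rand(2, 256, 64)]
+            >>> fa(points).shape
+            torch.Size([2, 256, 192])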
+ """ + + if len(points) > 1: + new_points = torch.cat(points[1:], dim=-1) + new_points = new_points.transpose(1, 2).contiguous() # (B, C, N) + new_points_copy = new_points + + new_points = self.mlps(new_points) + + new_fa_points = new_points.max(dim=-1, keepdim=True)[0] + new_fa_points = new_fa_points.repeat(1, 1, new_points.shape[-1]) + + new_points = torch.cat([new_fa_points, new_points_copy], dim=1) + new_points = new_points.transpose(1, 2).contiguous() + else: + new_points = points + + return new_points diff --git a/mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py b/mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py index c871721..eb047ac 100644 --- a/mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py +++ b/mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py @@ -1,59 +1,59 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule, force_fp32 -from torch import nn as nn - - -class DGCNNFPModule(BaseModule): - """Point feature propagation module used in DGCNN. - - Propagate the features from one set to another. - - Args: - mlp_channels (list[int]): List of mlp channels. - norm_cfg (dict, optional): Type of activation method. - Defaults to dict(type='BN1d'). - act_cfg (dict, optional): Type of activation method. - Defaults to dict(type='ReLU'). - init_cfg (dict, optional): Initialization config. Defaults to None. - """ - - def __init__(self, - mlp_channels, - norm_cfg=dict(type='BN1d'), - act_cfg=dict(type='ReLU'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.fp16_enabled = False - self.mlps = nn.Sequential() - for i in range(len(mlp_channels) - 1): - self.mlps.add_module( - f'layer{i}', - ConvModule( - mlp_channels[i], - mlp_channels[i + 1], - kernel_size=(1, ), - stride=(1, ), - conv_cfg=dict(type='Conv1d'), - norm_cfg=norm_cfg, - act_cfg=act_cfg)) - - @force_fp32() - def forward(self, points): - """forward. - - Args: - points (Tensor): (B, N, C) tensor of the input points. - - Returns: - Tensor: (B, N, M) M = mlp[-1], tensor of the new points. - """ - - if points is not None: - new_points = points.transpose(1, 2).contiguous() # (B, C, N) - new_points = self.mlps(new_points) - new_points = new_points.transpose(1, 2).contiguous() - else: - new_points = points - - return new_points +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn + + +class DGCNNFPModule(BaseModule): + """Point feature propagation module used in DGCNN. + + Propagate the features from one set to another. + + Args: + mlp_channels (list[int]): List of mlp channels. + norm_cfg (dict, optional): Type of activation method. + Defaults to dict(type='BN1d'). + act_cfg (dict, optional): Type of activation method. + Defaults to dict(type='ReLU'). + init_cfg (dict, optional): Initialization config. Defaults to None. + """ + + def __init__(self, + mlp_channels, + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.fp16_enabled = False + self.mlps = nn.Sequential() + for i in range(len(mlp_channels) - 1): + self.mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, ), + stride=(1, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + @force_fp32() + def forward(self, points): + """forward. + + Args: + points (Tensor): (B, N, C) tensor of the input points. + + Returns: + Tensor: (B, N, M) M = mlp[-1], tensor of the new points. 
+ """ + + if points is not None: + new_points = points.transpose(1, 2).contiguous() # (B, C, N) + new_points = self.mlps(new_points) + new_points = new_points.transpose(1, 2).contiguous() + else: + new_points = points + + return new_points diff --git a/mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py b/mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py index 96785e7..224fb46 100644 --- a/mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py +++ b/mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py @@ -1,221 +1,221 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import ConvModule -from mmcv.ops.group_points import GroupAll, QueryAndGroup, grouping_operation -from torch import nn as nn -from torch.nn import functional as F - - -class BaseDGCNNGFModule(nn.Module): - """Base module for point graph feature module used in DGCNN. - - Args: - radii (list[float]): List of radius in each knn or ball query. - sample_nums (list[int]): Number of samples in each knn or ball query. - mlp_channels (list[list[int]]): Specify of the dgcnn before - the global pooling for each graph feature module. - knn_modes (list[str], optional): Type of KNN method, valid mode - ['F-KNN', 'D-KNN'], Defaults to ['F-KNN']. - dilated_group (bool, optional): Whether to use dilated ball query. - Defaults to False. - use_xyz (bool, optional): Whether to use xyz as point features. - Defaults to True. - pool_mode (str, optional): Type of pooling method. Defaults to 'max'. - normalize_xyz (bool, optional): If ball query, whether to normalize - local XYZ with radius. Defaults to False. - grouper_return_grouped_xyz (bool, optional): Whether to return grouped - xyz in `QueryAndGroup`. Defaults to False. - grouper_return_grouped_idx (bool, optional): Whether to return grouped - idx in `QueryAndGroup`. Defaults to False. - """ - - def __init__(self, - radii, - sample_nums, - mlp_channels, - knn_modes=['F-KNN'], - dilated_group=False, - use_xyz=True, - pool_mode='max', - normalize_xyz=False, - grouper_return_grouped_xyz=False, - grouper_return_grouped_idx=False): - super(BaseDGCNNGFModule, self).__init__() - - assert len(sample_nums) == len( - mlp_channels - ), 'Num_samples and mlp_channels should have the same length.' - assert pool_mode in ['max', 'avg' - ], "Pool_mode should be one of ['max', 'avg']." - assert isinstance(knn_modes, list) or isinstance( - knn_modes, tuple), 'The type of knn_modes should be list or tuple.' - - if isinstance(mlp_channels, tuple): - mlp_channels = list(map(list, mlp_channels)) - self.mlp_channels = mlp_channels - - self.pool_mode = pool_mode - self.groupers = nn.ModuleList() - self.mlps = nn.ModuleList() - self.knn_modes = knn_modes - - for i in range(len(sample_nums)): - sample_num = sample_nums[i] - if sample_num is not None: - if self.knn_modes[i] == 'D-KNN': - grouper = QueryAndGroup( - radii[i], - sample_num, - use_xyz=use_xyz, - normalize_xyz=normalize_xyz, - return_grouped_xyz=grouper_return_grouped_xyz, - return_grouped_idx=True) - else: - grouper = QueryAndGroup( - radii[i], - sample_num, - use_xyz=use_xyz, - normalize_xyz=normalize_xyz, - return_grouped_xyz=grouper_return_grouped_xyz, - return_grouped_idx=grouper_return_grouped_idx) - else: - grouper = GroupAll(use_xyz) - self.groupers.append(grouper) - - def _pool_features(self, features): - """Perform feature aggregation using pooling operation. - - Args: - features (torch.Tensor): (B, C, N, K) - Features of locally grouped points before pooling. - - Returns: - torch.Tensor: (B, C, N) - Pooled features aggregating local information. 
- """ - if self.pool_mode == 'max': - # (B, C, N, 1) - new_features = F.max_pool2d( - features, kernel_size=[1, features.size(3)]) - elif self.pool_mode == 'avg': - # (B, C, N, 1) - new_features = F.avg_pool2d( - features, kernel_size=[1, features.size(3)]) - else: - raise NotImplementedError - - return new_features.squeeze(-1).contiguous() - - def forward(self, points): - """forward. - - Args: - points (Tensor): (B, N, C) input points. - - Returns: - List[Tensor]: (B, N, C1) new points generated from each graph - feature module. - """ - new_points_list = [points] - - for i in range(len(self.groupers)): - - new_points = new_points_list[i] - new_points_trans = new_points.transpose( - 1, 2).contiguous() # (B, C, N) - - if self.knn_modes[i] == 'D-KNN': - # (B, N, C) -> (B, N, K) - idx = self.groupers[i](new_points[..., -3:].contiguous(), - new_points[..., -3:].contiguous())[-1] - - grouped_results = grouping_operation( - new_points_trans, idx) # (B, C, N) -> (B, C, N, K) - grouped_results -= new_points_trans.unsqueeze(-1) - else: - grouped_results = self.groupers[i]( - new_points, new_points) # (B, N, C) -> (B, C, N, K) - - new_points = new_points_trans.unsqueeze(-1).repeat( - 1, 1, 1, grouped_results.shape[-1]) - new_points = torch.cat([grouped_results, new_points], dim=1) - - # (B, mlp[-1], N, K) - new_points = self.mlps[i](new_points) - - # (B, mlp[-1], N) - new_points = self._pool_features(new_points) - new_points = new_points.transpose(1, 2).contiguous() - new_points_list.append(new_points) - - return new_points - - -class DGCNNGFModule(BaseDGCNNGFModule): - """Point graph feature module used in DGCNN. - - Args: - mlp_channels (list[int]): Specify of the dgcnn before - the global pooling for each graph feature module. - num_sample (int, optional): Number of samples in each knn or ball - query. Defaults to None. - knn_mode (str, optional): Type of KNN method, valid mode - ['F-KNN', 'D-KNN']. Defaults to 'F-KNN'. - radius (float, optional): Radius to group with. - Defaults to None. - dilated_group (bool, optional): Whether to use dilated ball query. - Defaults to False. - norm_cfg (dict, optional): Type of normalization method. - Defaults to dict(type='BN2d'). - act_cfg (dict, optional): Type of activation method. - Defaults to dict(type='ReLU'). - use_xyz (bool, optional): Whether to use xyz as point features. - Defaults to True. - pool_mode (str, optional): Type of pooling method. - Defaults to 'max'. - normalize_xyz (bool, optional): If ball query, whether to normalize - local XYZ with radius. Defaults to False. - bias (bool | str, optional): If specified as `auto`, it will be decided - by the norm_cfg. Bias will be set as True if `norm_cfg` is None, - otherwise False. Defaults to 'auto'. 
- """ - - def __init__(self, - mlp_channels, - num_sample=None, - knn_mode='F-KNN', - radius=None, - dilated_group=False, - norm_cfg=dict(type='BN2d'), - act_cfg=dict(type='ReLU'), - use_xyz=True, - pool_mode='max', - normalize_xyz=False, - bias='auto'): - super(DGCNNGFModule, self).__init__( - mlp_channels=[mlp_channels], - sample_nums=[num_sample], - knn_modes=[knn_mode], - radii=[radius], - use_xyz=use_xyz, - pool_mode=pool_mode, - normalize_xyz=normalize_xyz, - dilated_group=dilated_group) - - for i in range(len(self.mlp_channels)): - mlp_channel = self.mlp_channels[i] - - mlp = nn.Sequential() - for i in range(len(mlp_channel) - 1): - mlp.add_module( - f'layer{i}', - ConvModule( - mlp_channel[i], - mlp_channel[i + 1], - kernel_size=(1, 1), - stride=(1, 1), - conv_cfg=dict(type='Conv2d'), - norm_cfg=norm_cfg, - act_cfg=act_cfg, - bias=bias)) - self.mlps.append(mlp) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from mmcv.ops.group_points import GroupAll, QueryAndGroup, grouping_operation +from torch import nn as nn +from torch.nn import functional as F + + +class BaseDGCNNGFModule(nn.Module): + """Base module for point graph feature module used in DGCNN. + + Args: + radii (list[float]): List of radius in each knn or ball query. + sample_nums (list[int]): Number of samples in each knn or ball query. + mlp_channels (list[list[int]]): Specify of the dgcnn before + the global pooling for each graph feature module. + knn_modes (list[str], optional): Type of KNN method, valid mode + ['F-KNN', 'D-KNN'], Defaults to ['F-KNN']. + dilated_group (bool, optional): Whether to use dilated ball query. + Defaults to False. + use_xyz (bool, optional): Whether to use xyz as point features. + Defaults to True. + pool_mode (str, optional): Type of pooling method. Defaults to 'max'. + normalize_xyz (bool, optional): If ball query, whether to normalize + local XYZ with radius. Defaults to False. + grouper_return_grouped_xyz (bool, optional): Whether to return grouped + xyz in `QueryAndGroup`. Defaults to False. + grouper_return_grouped_idx (bool, optional): Whether to return grouped + idx in `QueryAndGroup`. Defaults to False. + """ + + def __init__(self, + radii, + sample_nums, + mlp_channels, + knn_modes=['F-KNN'], + dilated_group=False, + use_xyz=True, + pool_mode='max', + normalize_xyz=False, + grouper_return_grouped_xyz=False, + grouper_return_grouped_idx=False): + super(BaseDGCNNGFModule, self).__init__() + + assert len(sample_nums) == len( + mlp_channels + ), 'Num_samples and mlp_channels should have the same length.' + assert pool_mode in ['max', 'avg' + ], "Pool_mode should be one of ['max', 'avg']." + assert isinstance(knn_modes, list) or isinstance( + knn_modes, tuple), 'The type of knn_modes should be list or tuple.' 
+ + if isinstance(mlp_channels, tuple): + mlp_channels = list(map(list, mlp_channels)) + self.mlp_channels = mlp_channels + + self.pool_mode = pool_mode + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + self.knn_modes = knn_modes + + for i in range(len(sample_nums)): + sample_num = sample_nums[i] + if sample_num is not None: + if self.knn_modes[i] == 'D-KNN': + grouper = QueryAndGroup( + radii[i], + sample_num, + use_xyz=use_xyz, + normalize_xyz=normalize_xyz, + return_grouped_xyz=grouper_return_grouped_xyz, + return_grouped_idx=True) + else: + grouper = QueryAndGroup( + radii[i], + sample_num, + use_xyz=use_xyz, + normalize_xyz=normalize_xyz, + return_grouped_xyz=grouper_return_grouped_xyz, + return_grouped_idx=grouper_return_grouped_idx) + else: + grouper = GroupAll(use_xyz) + self.groupers.append(grouper) + + def _pool_features(self, features): + """Perform feature aggregation using pooling operation. + + Args: + features (torch.Tensor): (B, C, N, K) + Features of locally grouped points before pooling. + + Returns: + torch.Tensor: (B, C, N) + Pooled features aggregating local information. + """ + if self.pool_mode == 'max': + # (B, C, N, 1) + new_features = F.max_pool2d( + features, kernel_size=[1, features.size(3)]) + elif self.pool_mode == 'avg': + # (B, C, N, 1) + new_features = F.avg_pool2d( + features, kernel_size=[1, features.size(3)]) + else: + raise NotImplementedError + + return new_features.squeeze(-1).contiguous() + + def forward(self, points): + """forward. + + Args: + points (Tensor): (B, N, C) input points. + + Returns: + List[Tensor]: (B, N, C1) new points generated from each graph + feature module. + """ + new_points_list = [points] + + for i in range(len(self.groupers)): + + new_points = new_points_list[i] + new_points_trans = new_points.transpose( + 1, 2).contiguous() # (B, C, N) + + if self.knn_modes[i] == 'D-KNN': + # (B, N, C) -> (B, N, K) + idx = self.groupers[i](new_points[..., -3:].contiguous(), + new_points[..., -3:].contiguous())[-1] + + grouped_results = grouping_operation( + new_points_trans, idx) # (B, C, N) -> (B, C, N, K) + grouped_results -= new_points_trans.unsqueeze(-1) + else: + grouped_results = self.groupers[i]( + new_points, new_points) # (B, N, C) -> (B, C, N, K) + + new_points = new_points_trans.unsqueeze(-1).repeat( + 1, 1, 1, grouped_results.shape[-1]) + new_points = torch.cat([grouped_results, new_points], dim=1) + + # (B, mlp[-1], N, K) + new_points = self.mlps[i](new_points) + + # (B, mlp[-1], N) + new_points = self._pool_features(new_points) + new_points = new_points.transpose(1, 2).contiguous() + new_points_list.append(new_points) + + return new_points + + +class DGCNNGFModule(BaseDGCNNGFModule): + """Point graph feature module used in DGCNN. + + Args: + mlp_channels (list[int]): Specify of the dgcnn before + the global pooling for each graph feature module. + num_sample (int, optional): Number of samples in each knn or ball + query. Defaults to None. + knn_mode (str, optional): Type of KNN method, valid mode + ['F-KNN', 'D-KNN']. Defaults to 'F-KNN'. + radius (float, optional): Radius to group with. + Defaults to None. + dilated_group (bool, optional): Whether to use dilated ball query. + Defaults to False. + norm_cfg (dict, optional): Type of normalization method. + Defaults to dict(type='BN2d'). + act_cfg (dict, optional): Type of activation method. + Defaults to dict(type='ReLU'). + use_xyz (bool, optional): Whether to use xyz as point features. + Defaults to True. + pool_mode (str, optional): Type of pooling method. 
+ Defaults to 'max'. + normalize_xyz (bool, optional): If ball query, whether to normalize + local XYZ with radius. Defaults to False. + bias (bool | str, optional): If specified as `auto`, it will be decided + by the norm_cfg. Bias will be set as True if `norm_cfg` is None, + otherwise False. Defaults to 'auto'. + """ + + def __init__(self, + mlp_channels, + num_sample=None, + knn_mode='F-KNN', + radius=None, + dilated_group=False, + norm_cfg=dict(type='BN2d'), + act_cfg=dict(type='ReLU'), + use_xyz=True, + pool_mode='max', + normalize_xyz=False, + bias='auto'): + super(DGCNNGFModule, self).__init__( + mlp_channels=[mlp_channels], + sample_nums=[num_sample], + knn_modes=[knn_mode], + radii=[radius], + use_xyz=use_xyz, + pool_mode=pool_mode, + normalize_xyz=normalize_xyz, + dilated_group=dilated_group) + + for i in range(len(self.mlp_channels)): + mlp_channel = self.mlp_channels[i] + + mlp = nn.Sequential() + for i in range(len(mlp_channel) - 1): + mlp.add_module( + f'layer{i}', + ConvModule( + mlp_channel[i], + mlp_channel[i + 1], + kernel_size=(1, 1), + stride=(1, 1), + conv_cfg=dict(type='Conv2d'), + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=bias)) + self.mlps.append(mlp) diff --git a/mmdet3d/ops/norm.py b/mmdet3d/ops/norm.py index 98ec7f1..4e60e5a 100644 --- a/mmdet3d/ops/norm.py +++ b/mmdet3d/ops/norm.py @@ -1,163 +1,163 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import NORM_LAYERS -from mmcv.runner import force_fp32 -from torch import distributed as dist -from torch import nn as nn -from torch.autograd.function import Function - - -class AllReduce(Function): - - @staticmethod - def forward(ctx, input): - input_list = [ - torch.zeros_like(input) for k in range(dist.get_world_size()) - ] - # Use allgather instead of allreduce in-place operations is unreliable - dist.all_gather(input_list, input, async_op=False) - inputs = torch.stack(input_list, dim=0) - return torch.sum(inputs, dim=0) - - @staticmethod - def backward(ctx, grad_output): - dist.all_reduce(grad_output, async_op=False) - return grad_output - - -@NORM_LAYERS.register_module('naiveSyncBN1d') -class NaiveSyncBatchNorm1d(nn.BatchNorm1d): - """Synchronized Batch Normalization for 3D Tensors. - - Note: - This implementation is modified from - https://github.com/facebookresearch/detectron2/ - - `torch.nn.SyncBatchNorm` has known unknown bugs. - It produces significantly worse AP (and sometimes goes NaN) - when the batch size on each worker is quite different - (e.g., when scale augmentation is used). - In 3D detection, different workers has points of different shapes, - which also cause instability. - - Use this implementation before `nn.SyncBatchNorm` is fixed. - It is slower than `nn.SyncBatchNorm`. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.fp16_enabled = False - - # customized normalization layer still needs this decorator - # to force the input to be fp32 and the output to be fp16 - # TODO: make mmcv fp16 utils handle customized norm layers - @force_fp32(out_fp16=True) - def forward(self, input): - """ - Args: - input (tensor): Has shape (N, C) or (N, C, L), where N is - the batch size, C is the number of features or - channels, and L is the sequence length - - Returns: - tensor: Has shape (N, C) or (N, C, L), has same shape - as input. 
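# A minimal usage sketch for the naive SyncBN layers defined in norm.py,
# assuming they are built through mmcv's build_norm_layer via the registered
# names 'naiveSyncBN1d' / 'naiveSyncBN2d' (importing the module triggers the
# registration). Outside distributed training they fall back to plain
# BatchNorm, so this runs on CPU; sizes are illustrative.
import torch
from mmcv.cnn import build_norm_layer

import mmdet3d.ops.norm  # noqa: F401, registers naiveSyncBN1d/naiveSyncBN2d

norm_name, bn = build_norm_layer(dict(type='naiveSyncBN2d'), num_features=64)
feat = torch.rand(2, 64, 32, 32)  # (N, C, H, W), must be float32
out = bn(feat)                    # same shape; the all-reduce path only runs
                                  # when torch.distributed is initialized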
- """ - assert input.dtype == torch.float32, \ - f'input should be in float32 type, got {input.dtype}' - using_dist = dist.is_available() and dist.is_initialized() - if (not using_dist) or dist.get_world_size() == 1 \ - or not self.training: - return super().forward(input) - assert input.shape[0] > 0, 'SyncBN does not support empty inputs' - is_two_dim = input.dim() == 2 - if is_two_dim: - input = input.unsqueeze(2) - - C = input.shape[1] - mean = torch.mean(input, dim=[0, 2]) - meansqr = torch.mean(input * input, dim=[0, 2]) - - vec = torch.cat([mean, meansqr], dim=0) - vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) - - mean, meansqr = torch.split(vec, C) - var = meansqr - mean * mean - self.running_mean += self.momentum * ( - mean.detach() - self.running_mean) - self.running_var += self.momentum * (var.detach() - self.running_var) - - invstd = torch.rsqrt(var + self.eps) - scale = self.weight * invstd - bias = self.bias - mean * scale - scale = scale.reshape(1, -1, 1) - bias = bias.reshape(1, -1, 1) - output = input * scale + bias - if is_two_dim: - output = output.squeeze(2) - return output - - -@NORM_LAYERS.register_module('naiveSyncBN2d') -class NaiveSyncBatchNorm2d(nn.BatchNorm2d): - """Synchronized Batch Normalization for 4D Tensors. - - Note: - This implementation is modified from - https://github.com/facebookresearch/detectron2/ - - `torch.nn.SyncBatchNorm` has known unknown bugs. - It produces significantly worse AP (and sometimes goes NaN) - when the batch size on each worker is quite different - (e.g., when scale augmentation is used). - This phenomenon also occurs when the multi-modality feature fusion - modules of multi-modality detectors use SyncBN. - - Use this implementation before `nn.SyncBatchNorm` is fixed. - It is slower than `nn.SyncBatchNorm`. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.fp16_enabled = False - - # customized normalization layer still needs this decorator - # to force the input to be fp32 and the output to be fp16 - # TODO: make mmcv fp16 utils handle customized norm layers - @force_fp32(out_fp16=True) - def forward(self, input): - """ - Args: - Input (tensor): Feature has shape (N, C, H, W). - - Returns: - tensor: Has shape (N, C, H, W), same shape as input. - """ - assert input.dtype == torch.float32, \ - f'input should be in float32 type, got {input.dtype}' - using_dist = dist.is_available() and dist.is_initialized() - if (not using_dist) or \ - dist.get_world_size() == 1 or \ - not self.training: - return super().forward(input) - - assert input.shape[0] > 0, 'SyncBN does not support empty inputs' - C = input.shape[1] - mean = torch.mean(input, dim=[0, 2, 3]) - meansqr = torch.mean(input * input, dim=[0, 2, 3]) - - vec = torch.cat([mean, meansqr], dim=0) - vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) - - mean, meansqr = torch.split(vec, C) - var = meansqr - mean * mean - self.running_mean += self.momentum * ( - mean.detach() - self.running_mean) - self.running_var += self.momentum * (var.detach() - self.running_var) - - invstd = torch.rsqrt(var + self.eps) - scale = self.weight * invstd - bias = self.bias - mean * scale - scale = scale.reshape(1, -1, 1, 1) - bias = bias.reshape(1, -1, 1, 1) - return input * scale + bias +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +from mmcv.cnn import NORM_LAYERS +from mmcv.runner import force_fp32 +from torch import distributed as dist +from torch import nn as nn +from torch.autograd.function import Function + + +class AllReduce(Function): + + @staticmethod + def forward(ctx, input): + input_list = [ + torch.zeros_like(input) for k in range(dist.get_world_size()) + ] + # Use allgather instead of allreduce in-place operations is unreliable + dist.all_gather(input_list, input, async_op=False) + inputs = torch.stack(input_list, dim=0) + return torch.sum(inputs, dim=0) + + @staticmethod + def backward(ctx, grad_output): + dist.all_reduce(grad_output, async_op=False) + return grad_output + + +@NORM_LAYERS.register_module('naiveSyncBN1d') +class NaiveSyncBatchNorm1d(nn.BatchNorm1d): + """Synchronized Batch Normalization for 3D Tensors. + + Note: + This implementation is modified from + https://github.com/facebookresearch/detectron2/ + + `torch.nn.SyncBatchNorm` has known unknown bugs. + It produces significantly worse AP (and sometimes goes NaN) + when the batch size on each worker is quite different + (e.g., when scale augmentation is used). + In 3D detection, different workers has points of different shapes, + which also cause instability. + + Use this implementation before `nn.SyncBatchNorm` is fixed. + It is slower than `nn.SyncBatchNorm`. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.fp16_enabled = False + + # customized normalization layer still needs this decorator + # to force the input to be fp32 and the output to be fp16 + # TODO: make mmcv fp16 utils handle customized norm layers + @force_fp32(out_fp16=True) + def forward(self, input): + """ + Args: + input (tensor): Has shape (N, C) or (N, C, L), where N is + the batch size, C is the number of features or + channels, and L is the sequence length + + Returns: + tensor: Has shape (N, C) or (N, C, L), has same shape + as input. + """ + assert input.dtype == torch.float32, \ + f'input should be in float32 type, got {input.dtype}' + using_dist = dist.is_available() and dist.is_initialized() + if (not using_dist) or dist.get_world_size() == 1 \ + or not self.training: + return super().forward(input) + assert input.shape[0] > 0, 'SyncBN does not support empty inputs' + is_two_dim = input.dim() == 2 + if is_two_dim: + input = input.unsqueeze(2) + + C = input.shape[1] + mean = torch.mean(input, dim=[0, 2]) + meansqr = torch.mean(input * input, dim=[0, 2]) + + vec = torch.cat([mean, meansqr], dim=0) + vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) + + mean, meansqr = torch.split(vec, C) + var = meansqr - mean * mean + self.running_mean += self.momentum * ( + mean.detach() - self.running_mean) + self.running_var += self.momentum * (var.detach() - self.running_var) + + invstd = torch.rsqrt(var + self.eps) + scale = self.weight * invstd + bias = self.bias - mean * scale + scale = scale.reshape(1, -1, 1) + bias = bias.reshape(1, -1, 1) + output = input * scale + bias + if is_two_dim: + output = output.squeeze(2) + return output + + +@NORM_LAYERS.register_module('naiveSyncBN2d') +class NaiveSyncBatchNorm2d(nn.BatchNorm2d): + """Synchronized Batch Normalization for 4D Tensors. + + Note: + This implementation is modified from + https://github.com/facebookresearch/detectron2/ + + `torch.nn.SyncBatchNorm` has known unknown bugs. + It produces significantly worse AP (and sometimes goes NaN) + when the batch size on each worker is quite different + (e.g., when scale augmentation is used). 
+ This phenomenon also occurs when the multi-modality feature fusion + modules of multi-modality detectors use SyncBN. + + Use this implementation before `nn.SyncBatchNorm` is fixed. + It is slower than `nn.SyncBatchNorm`. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.fp16_enabled = False + + # customized normalization layer still needs this decorator + # to force the input to be fp32 and the output to be fp16 + # TODO: make mmcv fp16 utils handle customized norm layers + @force_fp32(out_fp16=True) + def forward(self, input): + """ + Args: + Input (tensor): Feature has shape (N, C, H, W). + + Returns: + tensor: Has shape (N, C, H, W), same shape as input. + """ + assert input.dtype == torch.float32, \ + f'input should be in float32 type, got {input.dtype}' + using_dist = dist.is_available() and dist.is_initialized() + if (not using_dist) or \ + dist.get_world_size() == 1 or \ + not self.training: + return super().forward(input) + + assert input.shape[0] > 0, 'SyncBN does not support empty inputs' + C = input.shape[1] + mean = torch.mean(input, dim=[0, 2, 3]) + meansqr = torch.mean(input * input, dim=[0, 2, 3]) + + vec = torch.cat([mean, meansqr], dim=0) + vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) + + mean, meansqr = torch.split(vec, C) + var = meansqr - mean * mean + self.running_mean += self.momentum * ( + mean.detach() - self.running_mean) + self.running_var += self.momentum * (var.detach() - self.running_var) + + invstd = torch.rsqrt(var + self.eps) + scale = self.weight * invstd + bias = self.bias - mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + return input * scale + bias diff --git a/mmdet3d/ops/paconv/__init__.py b/mmdet3d/ops/paconv/__init__.py index d71c766..e8712dd 100644 --- a/mmdet3d/ops/paconv/__init__.py +++ b/mmdet3d/ops/paconv/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .paconv import PAConv, PAConvCUDA - -__all__ = ['PAConv', 'PAConvCUDA'] +# Copyright (c) OpenMMLab. All rights reserved. +from .paconv import PAConv, PAConvCUDA + +__all__ = ['PAConv', 'PAConvCUDA'] diff --git a/mmdet3d/ops/paconv/paconv.py b/mmdet3d/ops/paconv/paconv.py index bda8bfe..26b1949 100644 --- a/mmdet3d/ops/paconv/paconv.py +++ b/mmdet3d/ops/paconv/paconv.py @@ -1,392 +1,392 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy - -import torch -from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer, - constant_init) -from mmcv.ops import assign_score_withk as assign_score_cuda -from torch import nn as nn -from torch.nn import functional as F - -from .utils import assign_kernel_withoutk, assign_score, calc_euclidian_dist - - -class ScoreNet(nn.Module): - r"""ScoreNet that outputs coefficient scores to assemble kernel weights in - the weight bank according to the relative position of point pairs. - - Args: - mlp_channels (List[int]): Hidden unit sizes of SharedMLP layers. - last_bn (bool, optional): Whether to use BN on the last output of mlps. - Defaults to False. - score_norm (str, optional): Normalization function of output scores. - Can be 'softmax', 'sigmoid' or 'identity'. Defaults to 'softmax'. - temp_factor (float, optional): Temperature factor to scale the output - scores before softmax. Defaults to 1.0. - norm_cfg (dict, optional): Type of normalization method. - Defaults to dict(type='BN2d'). - bias (bool | str, optional): If specified as `auto`, it will be decided - by the norm_cfg. 
Bias will be set as True if `norm_cfg` is None, - otherwise False. Defaults to 'auto'. - - Note: - The official code applies xavier_init to all Conv layers in ScoreNet, - see `PAConv `_. However in our experiments, we - did not find much difference in applying such xavier initialization - or not. So we neglect this initialization in our implementation. - """ - - def __init__(self, - mlp_channels, - last_bn=False, - score_norm='softmax', - temp_factor=1.0, - norm_cfg=dict(type='BN2d'), - bias='auto'): - super(ScoreNet, self).__init__() - - assert score_norm in ['softmax', 'sigmoid', 'identity'], \ - f'unsupported score_norm function {score_norm}' - - self.score_norm = score_norm - self.temp_factor = temp_factor - - self.mlps = nn.Sequential() - for i in range(len(mlp_channels) - 2): - self.mlps.add_module( - f'layer{i}', - ConvModule( - mlp_channels[i], - mlp_channels[i + 1], - kernel_size=(1, 1), - stride=(1, 1), - conv_cfg=dict(type='Conv2d'), - norm_cfg=norm_cfg, - bias=bias)) - - # for the last mlp that outputs scores, no relu and possibly no bn - i = len(mlp_channels) - 2 - self.mlps.add_module( - f'layer{i}', - ConvModule( - mlp_channels[i], - mlp_channels[i + 1], - kernel_size=(1, 1), - stride=(1, 1), - conv_cfg=dict(type='Conv2d'), - norm_cfg=norm_cfg if last_bn else None, - act_cfg=None, - bias=bias)) - - def forward(self, xyz_features): - """Forward. - - Args: - xyz_features (torch.Tensor): (B, C, N, K), features constructed - from xyz coordinates of point pairs. May contain relative - positions, Euclidean distance, etc. - - Returns: - torch.Tensor: (B, N, K, M), predicted scores for `M` kernels. - """ - scores = self.mlps(xyz_features) # (B, M, N, K) - - # perform score normalization - if self.score_norm == 'softmax': - scores = F.softmax(scores / self.temp_factor, dim=1) - elif self.score_norm == 'sigmoid': - scores = torch.sigmoid(scores / self.temp_factor) - else: # 'identity' - scores = scores - - scores = scores.permute(0, 2, 3, 1) # (B, N, K, M) - - return scores - - -class PAConv(nn.Module): - """Non-CUDA version of PAConv. - - PAConv stores a trainable weight bank containing several kernel weights. - Given input points and features, it computes coefficient scores to assemble - those kernels to form conv kernels, and then runs convolution on the input. - - Args: - in_channels (int): Input channels of point features. - out_channels (int): Output channels of point features. - num_kernels (int): Number of kernel weights in the weight bank. - norm_cfg (dict, optional): Type of normalization method. - Defaults to dict(type='BN2d', momentum=0.1). - act_cfg (dict, optional): Type of activation method. - Defaults to dict(type='ReLU', inplace=True). - scorenet_input (str, optional): Type of input to ScoreNet. - Can be 'identity', 'w_neighbor' or 'w_neighbor_dist'. - Defaults to 'w_neighbor_dist'. - weight_bank_init (str, optional): Init method of weight bank kernels. - Can be 'kaiming' or 'xavier'. Defaults to 'kaiming'. - kernel_input (str, optional): Input features to be multiplied with - kernel weights. Can be 'identity' or 'w_neighbor'. - Defaults to 'w_neighbor'. - scorenet_cfg (dict, optional): Config of the ScoreNet module, which - may contain the following keys and values: - - - mlp_channels (List[int]): Hidden units of MLPs. - - score_norm (str): Normalization function of output scores. - Can be 'softmax', 'sigmoid' or 'identity'. - - temp_factor (float): Temperature factor to scale the output - scores before softmax. 
- - last_bn (bool): Whether to use BN on the last output of mlps. - """ - - def __init__(self, - in_channels, - out_channels, - num_kernels, - norm_cfg=dict(type='BN2d', momentum=0.1), - act_cfg=dict(type='ReLU', inplace=True), - scorenet_input='w_neighbor_dist', - weight_bank_init='kaiming', - kernel_input='w_neighbor', - scorenet_cfg=dict( - mlp_channels=[16, 16, 16], - score_norm='softmax', - temp_factor=1.0, - last_bn=False)): - super(PAConv, self).__init__() - - # determine weight kernel size according to used features - if kernel_input == 'identity': - # only use grouped_features - kernel_mul = 1 - elif kernel_input == 'w_neighbor': - # concat of (grouped_features - center_features, grouped_features) - kernel_mul = 2 - else: - raise NotImplementedError( - f'unsupported kernel_input {kernel_input}') - self.kernel_input = kernel_input - in_channels = kernel_mul * in_channels - - # determine mlp channels in ScoreNet according to used xyz features - if scorenet_input == 'identity': - # only use relative position (grouped_xyz - center_xyz) - self.scorenet_in_channels = 3 - elif scorenet_input == 'w_neighbor': - # (grouped_xyz - center_xyz, grouped_xyz) - self.scorenet_in_channels = 6 - elif scorenet_input == 'w_neighbor_dist': - # (center_xyz, grouped_xyz - center_xyz, Euclidean distance) - self.scorenet_in_channels = 7 - else: - raise NotImplementedError( - f'unsupported scorenet_input {scorenet_input}') - self.scorenet_input = scorenet_input - - # construct kernel weights in weight bank - # self.weight_bank is of shape [C, num_kernels * out_c] - # where C can be in_c or (2 * in_c) - if weight_bank_init == 'kaiming': - weight_init = nn.init.kaiming_normal_ - elif weight_bank_init == 'xavier': - weight_init = nn.init.xavier_normal_ - else: - raise NotImplementedError( - f'unsupported weight bank init method {weight_bank_init}') - - self.num_kernels = num_kernels # the parameter `m` in the paper - weight_bank = weight_init( - torch.empty(self.num_kernels, in_channels, out_channels)) - weight_bank = weight_bank.permute(1, 0, 2).reshape( - in_channels, self.num_kernels * out_channels).contiguous() - self.weight_bank = nn.Parameter(weight_bank, requires_grad=True) - - # construct ScoreNet - scorenet_cfg_ = copy.deepcopy(scorenet_cfg) - scorenet_cfg_['mlp_channels'].insert(0, self.scorenet_in_channels) - scorenet_cfg_['mlp_channels'].append(self.num_kernels) - self.scorenet = ScoreNet(**scorenet_cfg_) - - self.bn = build_norm_layer(norm_cfg, out_channels)[1] if \ - norm_cfg is not None else None - self.activate = build_activation_layer(act_cfg) if \ - act_cfg is not None else None - - # set some basic attributes of Conv layers - self.in_channels = in_channels - self.out_channels = out_channels - - self.init_weights() - - def init_weights(self): - """Initialize weights of shared MLP layers and BN layers.""" - if self.bn is not None: - constant_init(self.bn, val=1, bias=0) - - def _prepare_scorenet_input(self, points_xyz): - """Prepare input point pairs features for self.ScoreNet. - - Args: - points_xyz (torch.Tensor): (B, 3, npoint, K) - Coordinates of the grouped points. - - Returns: - torch.Tensor: (B, C, npoint, K) - The generated features per point pair. 
- """ - B, _, npoint, K = points_xyz.size() - center_xyz = points_xyz[..., :1].repeat(1, 1, 1, K) - xyz_diff = points_xyz - center_xyz # [B, 3, npoint, K] - if self.scorenet_input == 'identity': - xyz_features = xyz_diff - elif self.scorenet_input == 'w_neighbor': - xyz_features = torch.cat((xyz_diff, points_xyz), dim=1) - else: # w_neighbor_dist - euclidian_dist = calc_euclidian_dist( - center_xyz.permute(0, 2, 3, 1).reshape(B * npoint * K, 3), - points_xyz.permute(0, 2, 3, 1).reshape(B * npoint * K, 3)).\ - reshape(B, 1, npoint, K) - xyz_features = torch.cat((center_xyz, xyz_diff, euclidian_dist), - dim=1) - return xyz_features - - def forward(self, inputs): - """Forward. - - Args: - inputs (tuple(torch.Tensor)): - - - features (torch.Tensor): (B, in_c, npoint, K) - Features of the queried points. - - points_xyz (torch.Tensor): (B, 3, npoint, K) - Coordinates of the grouped points. - - Returns: - Tuple[torch.Tensor]: - - - new_features: (B, out_c, npoint, K), features after PAConv. - - points_xyz: same as input. - """ - features, points_xyz = inputs - B, _, npoint, K = features.size() - - if self.kernel_input == 'w_neighbor': - center_features = features[..., :1].repeat(1, 1, 1, K) - features_diff = features - center_features - # to (B, 2 * in_c, npoint, K) - features = torch.cat((features_diff, features), dim=1) - - # prepare features for between each point and its grouping center - xyz_features = self._prepare_scorenet_input(points_xyz) - - # scores to assemble kernel weights - scores = self.scorenet(xyz_features) # [B, npoint, K, m] - - # first compute out features over all kernels - # features is [B, C, npoint, K], weight_bank is [C, m * out_c] - new_features = torch.matmul( - features.permute(0, 2, 3, 1), - self.weight_bank).view(B, npoint, K, self.num_kernels, - -1) # [B, npoint, K, m, out_c] - - # then aggregate using scores - new_features = assign_score(scores, new_features) - # to [B, out_c, npoint, K] - new_features = new_features.permute(0, 3, 1, 2).contiguous() - - if self.bn is not None: - new_features = self.bn(new_features) - if self.activate is not None: - new_features = self.activate(new_features) - - # in order to keep input output consistency - # so that we can wrap PAConv in Sequential - return (new_features, points_xyz) - - -class PAConvCUDA(PAConv): - """CUDA version of PAConv that implements a cuda op to efficiently perform - kernel assembling. - - Different from vanilla PAConv, the input features of this function is not - grouped by centers. Instead, they will be queried on-the-fly by the - additional input `points_idx`. This avoids the large intermediate matrix. - See the `paper `_ appendix Sec. D for - more detailed descriptions. - """ - - def __init__(self, - in_channels, - out_channels, - num_kernels, - norm_cfg=dict(type='BN2d', momentum=0.1), - act_cfg=dict(type='ReLU', inplace=True), - scorenet_input='w_neighbor_dist', - weight_bank_init='kaiming', - kernel_input='w_neighbor', - scorenet_cfg=dict( - mlp_channels=[8, 16, 16], - score_norm='softmax', - temp_factor=1.0, - last_bn=False)): - super(PAConvCUDA, self).__init__( - in_channels=in_channels, - out_channels=out_channels, - num_kernels=num_kernels, - norm_cfg=norm_cfg, - act_cfg=act_cfg, - scorenet_input=scorenet_input, - weight_bank_init=weight_bank_init, - kernel_input=kernel_input, - scorenet_cfg=scorenet_cfg) - - assert self.kernel_input == 'w_neighbor', \ - 'CUDA implemented PAConv only supports w_neighbor kernel_input' - - def forward(self, inputs): - """Forward. 
- - Args: - inputs (tuple(torch.Tensor)): - - - features (torch.Tensor): (B, in_c, N) - Features of all points in the current point cloud. - Different from non-CUDA version PAConv, here the features - are not grouped by each center to form a K dim. - - points_xyz (torch.Tensor): (B, 3, npoint, K) - Coordinates of the grouped points. - - points_idx (torch.Tensor): (B, npoint, K) - Index of the grouped points. - - Returns: - Tuple[torch.Tensor]: - - - new_features: (B, out_c, npoint, K), features after PAConv. - - points_xyz: same as input. - - points_idx: same as input. - """ - features, points_xyz, points_idx = inputs - - # prepare features for between each point and its grouping center - xyz_features = self._prepare_scorenet_input(points_xyz) - - # scores to assemble kernel weights - scores = self.scorenet(xyz_features) # [B, npoint, K, m] - - # pre-compute features for points and centers separately - # features is [B, in_c, N], weight_bank is [C, m * out_dim] - point_feat, center_feat = assign_kernel_withoutk( - features, self.weight_bank, self.num_kernels) - - # aggregate features using custom cuda op - new_features = assign_score_cuda( - scores, point_feat, center_feat, points_idx, - 'sum').contiguous() # [B, out_c, npoint, K] - - if self.bn is not None: - new_features = self.bn(new_features) - if self.activate is not None: - new_features = self.activate(new_features) - - # in order to keep input output consistency - return (new_features, points_xyz, points_idx) +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch +from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer, + constant_init) +from mmcv.ops import assign_score_withk as assign_score_cuda +from torch import nn as nn +from torch.nn import functional as F + +from .utils import assign_kernel_withoutk, assign_score, calc_euclidian_dist + + +class ScoreNet(nn.Module): + r"""ScoreNet that outputs coefficient scores to assemble kernel weights in + the weight bank according to the relative position of point pairs. + + Args: + mlp_channels (List[int]): Hidden unit sizes of SharedMLP layers. + last_bn (bool, optional): Whether to use BN on the last output of mlps. + Defaults to False. + score_norm (str, optional): Normalization function of output scores. + Can be 'softmax', 'sigmoid' or 'identity'. Defaults to 'softmax'. + temp_factor (float, optional): Temperature factor to scale the output + scores before softmax. Defaults to 1.0. + norm_cfg (dict, optional): Type of normalization method. + Defaults to dict(type='BN2d'). + bias (bool | str, optional): If specified as `auto`, it will be decided + by the norm_cfg. Bias will be set as True if `norm_cfg` is None, + otherwise False. Defaults to 'auto'. + + Note: + The official code applies xavier_init to all Conv layers in ScoreNet, + see `PAConv `_. However in our experiments, we + did not find much difference in applying such xavier initialization + or not. So we neglect this initialization in our implementation. 
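# A small shape sketch for ScoreNet, assuming 7-channel point-pair features as
# produced for scorenet_input='w_neighbor_dist' and 8 kernels in the weight
# bank; all sizes are illustrative and no custom op is needed, so this runs on
# CPU.
import torch

from mmdet3d.ops.paconv.paconv import ScoreNet

scorenet = ScoreNet(mlp_channels=[7, 16, 16, 8], score_norm='softmax')
xyz_features = torch.rand(2, 7, 128, 20)  # (B, C, npoint, K)
scores = scorenet(xyz_features)           # (B, npoint, K, M) = (2, 128, 20, 8)
assert scores.shape == (2, 128, 20, 8)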
+ """ + + def __init__(self, + mlp_channels, + last_bn=False, + score_norm='softmax', + temp_factor=1.0, + norm_cfg=dict(type='BN2d'), + bias='auto'): + super(ScoreNet, self).__init__() + + assert score_norm in ['softmax', 'sigmoid', 'identity'], \ + f'unsupported score_norm function {score_norm}' + + self.score_norm = score_norm + self.temp_factor = temp_factor + + self.mlps = nn.Sequential() + for i in range(len(mlp_channels) - 2): + self.mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, 1), + stride=(1, 1), + conv_cfg=dict(type='Conv2d'), + norm_cfg=norm_cfg, + bias=bias)) + + # for the last mlp that outputs scores, no relu and possibly no bn + i = len(mlp_channels) - 2 + self.mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, 1), + stride=(1, 1), + conv_cfg=dict(type='Conv2d'), + norm_cfg=norm_cfg if last_bn else None, + act_cfg=None, + bias=bias)) + + def forward(self, xyz_features): + """Forward. + + Args: + xyz_features (torch.Tensor): (B, C, N, K), features constructed + from xyz coordinates of point pairs. May contain relative + positions, Euclidean distance, etc. + + Returns: + torch.Tensor: (B, N, K, M), predicted scores for `M` kernels. + """ + scores = self.mlps(xyz_features) # (B, M, N, K) + + # perform score normalization + if self.score_norm == 'softmax': + scores = F.softmax(scores / self.temp_factor, dim=1) + elif self.score_norm == 'sigmoid': + scores = torch.sigmoid(scores / self.temp_factor) + else: # 'identity' + scores = scores + + scores = scores.permute(0, 2, 3, 1) # (B, N, K, M) + + return scores + + +class PAConv(nn.Module): + """Non-CUDA version of PAConv. + + PAConv stores a trainable weight bank containing several kernel weights. + Given input points and features, it computes coefficient scores to assemble + those kernels to form conv kernels, and then runs convolution on the input. + + Args: + in_channels (int): Input channels of point features. + out_channels (int): Output channels of point features. + num_kernels (int): Number of kernel weights in the weight bank. + norm_cfg (dict, optional): Type of normalization method. + Defaults to dict(type='BN2d', momentum=0.1). + act_cfg (dict, optional): Type of activation method. + Defaults to dict(type='ReLU', inplace=True). + scorenet_input (str, optional): Type of input to ScoreNet. + Can be 'identity', 'w_neighbor' or 'w_neighbor_dist'. + Defaults to 'w_neighbor_dist'. + weight_bank_init (str, optional): Init method of weight bank kernels. + Can be 'kaiming' or 'xavier'. Defaults to 'kaiming'. + kernel_input (str, optional): Input features to be multiplied with + kernel weights. Can be 'identity' or 'w_neighbor'. + Defaults to 'w_neighbor'. + scorenet_cfg (dict, optional): Config of the ScoreNet module, which + may contain the following keys and values: + + - mlp_channels (List[int]): Hidden units of MLPs. + - score_norm (str): Normalization function of output scores. + Can be 'softmax', 'sigmoid' or 'identity'. + - temp_factor (float): Temperature factor to scale the output + scores before softmax. + - last_bn (bool): Whether to use BN on the last output of mlps. 
+ """ + + def __init__(self, + in_channels, + out_channels, + num_kernels, + norm_cfg=dict(type='BN2d', momentum=0.1), + act_cfg=dict(type='ReLU', inplace=True), + scorenet_input='w_neighbor_dist', + weight_bank_init='kaiming', + kernel_input='w_neighbor', + scorenet_cfg=dict( + mlp_channels=[16, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False)): + super(PAConv, self).__init__() + + # determine weight kernel size according to used features + if kernel_input == 'identity': + # only use grouped_features + kernel_mul = 1 + elif kernel_input == 'w_neighbor': + # concat of (grouped_features - center_features, grouped_features) + kernel_mul = 2 + else: + raise NotImplementedError( + f'unsupported kernel_input {kernel_input}') + self.kernel_input = kernel_input + in_channels = kernel_mul * in_channels + + # determine mlp channels in ScoreNet according to used xyz features + if scorenet_input == 'identity': + # only use relative position (grouped_xyz - center_xyz) + self.scorenet_in_channels = 3 + elif scorenet_input == 'w_neighbor': + # (grouped_xyz - center_xyz, grouped_xyz) + self.scorenet_in_channels = 6 + elif scorenet_input == 'w_neighbor_dist': + # (center_xyz, grouped_xyz - center_xyz, Euclidean distance) + self.scorenet_in_channels = 7 + else: + raise NotImplementedError( + f'unsupported scorenet_input {scorenet_input}') + self.scorenet_input = scorenet_input + + # construct kernel weights in weight bank + # self.weight_bank is of shape [C, num_kernels * out_c] + # where C can be in_c or (2 * in_c) + if weight_bank_init == 'kaiming': + weight_init = nn.init.kaiming_normal_ + elif weight_bank_init == 'xavier': + weight_init = nn.init.xavier_normal_ + else: + raise NotImplementedError( + f'unsupported weight bank init method {weight_bank_init}') + + self.num_kernels = num_kernels # the parameter `m` in the paper + weight_bank = weight_init( + torch.empty(self.num_kernels, in_channels, out_channels)) + weight_bank = weight_bank.permute(1, 0, 2).reshape( + in_channels, self.num_kernels * out_channels).contiguous() + self.weight_bank = nn.Parameter(weight_bank, requires_grad=True) + + # construct ScoreNet + scorenet_cfg_ = copy.deepcopy(scorenet_cfg) + scorenet_cfg_['mlp_channels'].insert(0, self.scorenet_in_channels) + scorenet_cfg_['mlp_channels'].append(self.num_kernels) + self.scorenet = ScoreNet(**scorenet_cfg_) + + self.bn = build_norm_layer(norm_cfg, out_channels)[1] if \ + norm_cfg is not None else None + self.activate = build_activation_layer(act_cfg) if \ + act_cfg is not None else None + + # set some basic attributes of Conv layers + self.in_channels = in_channels + self.out_channels = out_channels + + self.init_weights() + + def init_weights(self): + """Initialize weights of shared MLP layers and BN layers.""" + if self.bn is not None: + constant_init(self.bn, val=1, bias=0) + + def _prepare_scorenet_input(self, points_xyz): + """Prepare input point pairs features for self.ScoreNet. + + Args: + points_xyz (torch.Tensor): (B, 3, npoint, K) + Coordinates of the grouped points. + + Returns: + torch.Tensor: (B, C, npoint, K) + The generated features per point pair. 
+ """ + B, _, npoint, K = points_xyz.size() + center_xyz = points_xyz[..., :1].repeat(1, 1, 1, K) + xyz_diff = points_xyz - center_xyz # [B, 3, npoint, K] + if self.scorenet_input == 'identity': + xyz_features = xyz_diff + elif self.scorenet_input == 'w_neighbor': + xyz_features = torch.cat((xyz_diff, points_xyz), dim=1) + else: # w_neighbor_dist + euclidian_dist = calc_euclidian_dist( + center_xyz.permute(0, 2, 3, 1).reshape(B * npoint * K, 3), + points_xyz.permute(0, 2, 3, 1).reshape(B * npoint * K, 3)).\ + reshape(B, 1, npoint, K) + xyz_features = torch.cat((center_xyz, xyz_diff, euclidian_dist), + dim=1) + return xyz_features + + def forward(self, inputs): + """Forward. + + Args: + inputs (tuple(torch.Tensor)): + + - features (torch.Tensor): (B, in_c, npoint, K) + Features of the queried points. + - points_xyz (torch.Tensor): (B, 3, npoint, K) + Coordinates of the grouped points. + + Returns: + Tuple[torch.Tensor]: + + - new_features: (B, out_c, npoint, K), features after PAConv. + - points_xyz: same as input. + """ + features, points_xyz = inputs + B, _, npoint, K = features.size() + + if self.kernel_input == 'w_neighbor': + center_features = features[..., :1].repeat(1, 1, 1, K) + features_diff = features - center_features + # to (B, 2 * in_c, npoint, K) + features = torch.cat((features_diff, features), dim=1) + + # prepare features for between each point and its grouping center + xyz_features = self._prepare_scorenet_input(points_xyz) + + # scores to assemble kernel weights + scores = self.scorenet(xyz_features) # [B, npoint, K, m] + + # first compute out features over all kernels + # features is [B, C, npoint, K], weight_bank is [C, m * out_c] + new_features = torch.matmul( + features.permute(0, 2, 3, 1), + self.weight_bank).view(B, npoint, K, self.num_kernels, + -1) # [B, npoint, K, m, out_c] + + # then aggregate using scores + new_features = assign_score(scores, new_features) + # to [B, out_c, npoint, K] + new_features = new_features.permute(0, 3, 1, 2).contiguous() + + if self.bn is not None: + new_features = self.bn(new_features) + if self.activate is not None: + new_features = self.activate(new_features) + + # in order to keep input output consistency + # so that we can wrap PAConv in Sequential + return (new_features, points_xyz) + + +class PAConvCUDA(PAConv): + """CUDA version of PAConv that implements a cuda op to efficiently perform + kernel assembling. + + Different from vanilla PAConv, the input features of this function is not + grouped by centers. Instead, they will be queried on-the-fly by the + additional input `points_idx`. This avoids the large intermediate matrix. + See the `paper `_ appendix Sec. D for + more detailed descriptions. + """ + + def __init__(self, + in_channels, + out_channels, + num_kernels, + norm_cfg=dict(type='BN2d', momentum=0.1), + act_cfg=dict(type='ReLU', inplace=True), + scorenet_input='w_neighbor_dist', + weight_bank_init='kaiming', + kernel_input='w_neighbor', + scorenet_cfg=dict( + mlp_channels=[8, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False)): + super(PAConvCUDA, self).__init__( + in_channels=in_channels, + out_channels=out_channels, + num_kernels=num_kernels, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + scorenet_input=scorenet_input, + weight_bank_init=weight_bank_init, + kernel_input=kernel_input, + scorenet_cfg=scorenet_cfg) + + assert self.kernel_input == 'w_neighbor', \ + 'CUDA implemented PAConv only supports w_neighbor kernel_input' + + def forward(self, inputs): + """Forward. 
+ + Args: + inputs (tuple(torch.Tensor)): + + - features (torch.Tensor): (B, in_c, N) + Features of all points in the current point cloud. + Different from non-CUDA version PAConv, here the features + are not grouped by each center to form a K dim. + - points_xyz (torch.Tensor): (B, 3, npoint, K) + Coordinates of the grouped points. + - points_idx (torch.Tensor): (B, npoint, K) + Index of the grouped points. + + Returns: + Tuple[torch.Tensor]: + + - new_features: (B, out_c, npoint, K), features after PAConv. + - points_xyz: same as input. + - points_idx: same as input. + """ + features, points_xyz, points_idx = inputs + + # prepare features for between each point and its grouping center + xyz_features = self._prepare_scorenet_input(points_xyz) + + # scores to assemble kernel weights + scores = self.scorenet(xyz_features) # [B, npoint, K, m] + + # pre-compute features for points and centers separately + # features is [B, in_c, N], weight_bank is [C, m * out_dim] + point_feat, center_feat = assign_kernel_withoutk( + features, self.weight_bank, self.num_kernels) + + # aggregate features using custom cuda op + new_features = assign_score_cuda( + scores, point_feat, center_feat, points_idx, + 'sum').contiguous() # [B, out_c, npoint, K] + + if self.bn is not None: + new_features = self.bn(new_features) + if self.activate is not None: + new_features = self.activate(new_features) + + # in order to keep input output consistency + return (new_features, points_xyz, points_idx) diff --git a/mmdet3d/ops/paconv/utils.py b/mmdet3d/ops/paconv/utils.py index 68e71d5..3690430 100644 --- a/mmdet3d/ops/paconv/utils.py +++ b/mmdet3d/ops/paconv/utils.py @@ -1,87 +1,87 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch - - -def calc_euclidian_dist(xyz1, xyz2): - """Calculate the Euclidean distance between two sets of points. - - Args: - xyz1 (torch.Tensor): (N, 3), the first set of points. - xyz2 (torch.Tensor): (N, 3), the second set of points. - - Returns: - torch.Tensor: (N, ), the Euclidean distance between each point pair. - """ - assert xyz1.shape[0] == xyz2.shape[0], 'number of points are not the same' - assert xyz1.shape[1] == xyz2.shape[1] == 3, \ - 'points coordinates dimension is not 3' - return torch.norm(xyz1 - xyz2, dim=-1) - - -def assign_score(scores, point_features): - """Perform weighted sum to aggregate output features according to scores. - This function is used in non-CUDA version of PAConv. - - Compared to the cuda op assigh_score_withk, this pytorch implementation - pre-computes output features for the neighbors of all centers, and then - performs aggregation. It consumes more GPU memories. - - Args: - scores (torch.Tensor): (B, npoint, K, M), predicted scores to - aggregate weight matrices in the weight bank. - `npoint` is the number of sampled centers. - `K` is the number of queried neighbors. - `M` is the number of weight matrices in the weight bank. - point_features (torch.Tensor): (B, npoint, K, M, out_dim) - Pre-computed point features to be aggregated. - - Returns: - torch.Tensor: (B, npoint, K, out_dim), the aggregated features. - """ - B, npoint, K, M = scores.size() - scores = scores.view(B, npoint, K, 1, M) - output = torch.matmul(scores, point_features).view(B, npoint, K, -1) - return output - - -def assign_kernel_withoutk(features, kernels, M): - """Pre-compute features with weight matrices in weight bank. This function - is used before cuda op assign_score_withk in CUDA version PAConv. 
- - Args: - features (torch.Tensor): (B, in_dim, N), input features of all points. - `N` is the number of points in current point cloud. - kernels (torch.Tensor): (2 * in_dim, M * out_dim), weight matrices in - the weight bank, transformed from (M, 2 * in_dim, out_dim). - `2 * in_dim` is because the input features are concatenation of - (point_features - center_features, point_features). - M (int): Number of weight matrices in the weight bank. - - Returns: - Tuple[torch.Tensor]: both of shape (B, N, M, out_dim): - - - point_features: Pre-computed features for points. - - center_features: Pre-computed features for centers. - """ - B, in_dim, N = features.size() - feat_trans = features.permute(0, 2, 1) # [B, N, in_dim] - out_feat_half1 = torch.matmul(feat_trans, kernels[:in_dim]).view( - B, N, M, -1) # [B, N, M, out_dim] - out_feat_half2 = torch.matmul(feat_trans, kernels[in_dim:]).view( - B, N, M, -1) # [B, N, M, out_dim] - - # TODO: why this hard-coded if condition? - # when the network input is only xyz without additional features - # xyz will be used as features, so that features.size(1) == 3 % 2 != 0 - # we need to compensate center_features because otherwise - # `point_features - center_features` will result in all zeros? - if features.size(1) % 2 != 0: - out_feat_half_coord = torch.matmul( - feat_trans[:, :, :3], # [B, N, 3] - kernels[in_dim:in_dim + 3]).view(B, N, M, -1) # [B, N, M, out_dim] - else: - out_feat_half_coord = torch.zeros_like(out_feat_half2) - - point_features = out_feat_half1 + out_feat_half2 - center_features = out_feat_half1 + out_feat_half_coord - return point_features, center_features +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def calc_euclidian_dist(xyz1, xyz2): + """Calculate the Euclidean distance between two sets of points. + + Args: + xyz1 (torch.Tensor): (N, 3), the first set of points. + xyz2 (torch.Tensor): (N, 3), the second set of points. + + Returns: + torch.Tensor: (N, ), the Euclidean distance between each point pair. + """ + assert xyz1.shape[0] == xyz2.shape[0], 'number of points are not the same' + assert xyz1.shape[1] == xyz2.shape[1] == 3, \ + 'points coordinates dimension is not 3' + return torch.norm(xyz1 - xyz2, dim=-1) + + +def assign_score(scores, point_features): + """Perform weighted sum to aggregate output features according to scores. + This function is used in non-CUDA version of PAConv. + + Compared to the cuda op assigh_score_withk, this pytorch implementation + pre-computes output features for the neighbors of all centers, and then + performs aggregation. It consumes more GPU memories. + + Args: + scores (torch.Tensor): (B, npoint, K, M), predicted scores to + aggregate weight matrices in the weight bank. + `npoint` is the number of sampled centers. + `K` is the number of queried neighbors. + `M` is the number of weight matrices in the weight bank. + point_features (torch.Tensor): (B, npoint, K, M, out_dim) + Pre-computed point features to be aggregated. + + Returns: + torch.Tensor: (B, npoint, K, out_dim), the aggregated features. + """ + B, npoint, K, M = scores.size() + scores = scores.view(B, npoint, K, 1, M) + output = torch.matmul(scores, point_features).view(B, npoint, K, -1) + return output + + +def assign_kernel_withoutk(features, kernels, M): + """Pre-compute features with weight matrices in weight bank. This function + is used before cuda op assign_score_withk in CUDA version PAConv. + + Args: + features (torch.Tensor): (B, in_dim, N), input features of all points. 
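# A shape sketch for the pure-PyTorch aggregation in assign_score: scores over
# M weight matrices blend the pre-computed per-kernel features. Sizes are
# illustrative; runs on CPU.
import torch

from mmdet3d.ops.paconv.utils import assign_score

B, npoint, K, M, out_dim = 2, 128, 20, 8, 32
scores = torch.rand(B, npoint, K, M)
point_features = torch.rand(B, npoint, K, M, out_dim)
aggregated = assign_score(scores, point_features)  # (B, npoint, K, out_dim)
assert aggregated.shape == (B, npoint, K, out_dim)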
+ `N` is the number of points in current point cloud. + kernels (torch.Tensor): (2 * in_dim, M * out_dim), weight matrices in + the weight bank, transformed from (M, 2 * in_dim, out_dim). + `2 * in_dim` is because the input features are concatenation of + (point_features - center_features, point_features). + M (int): Number of weight matrices in the weight bank. + + Returns: + Tuple[torch.Tensor]: both of shape (B, N, M, out_dim): + + - point_features: Pre-computed features for points. + - center_features: Pre-computed features for centers. + """ + B, in_dim, N = features.size() + feat_trans = features.permute(0, 2, 1) # [B, N, in_dim] + out_feat_half1 = torch.matmul(feat_trans, kernels[:in_dim]).view( + B, N, M, -1) # [B, N, M, out_dim] + out_feat_half2 = torch.matmul(feat_trans, kernels[in_dim:]).view( + B, N, M, -1) # [B, N, M, out_dim] + + # TODO: why this hard-coded if condition? + # when the network input is only xyz without additional features + # xyz will be used as features, so that features.size(1) == 3 % 2 != 0 + # we need to compensate center_features because otherwise + # `point_features - center_features` will result in all zeros? + if features.size(1) % 2 != 0: + out_feat_half_coord = torch.matmul( + feat_trans[:, :, :3], # [B, N, 3] + kernels[in_dim:in_dim + 3]).view(B, N, M, -1) # [B, N, M, out_dim] + else: + out_feat_half_coord = torch.zeros_like(out_feat_half2) + + point_features = out_feat_half1 + out_feat_half2 + center_features = out_feat_half1 + out_feat_half_coord + return point_features, center_features diff --git a/mmdet3d/ops/pointnet_modules/__init__.py b/mmdet3d/ops/pointnet_modules/__init__.py index 99b08eb..89e5b9d 100644 --- a/mmdet3d/ops/pointnet_modules/__init__.py +++ b/mmdet3d/ops/pointnet_modules/__init__.py @@ -1,12 +1,12 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .builder import build_sa_module -from .paconv_sa_module import (PAConvCUDASAModule, PAConvCUDASAModuleMSG, - PAConvSAModule, PAConvSAModuleMSG) -from .point_fp_module import PointFPModule -from .point_sa_module import PointSAModule, PointSAModuleMSG - -__all__ = [ - 'build_sa_module', 'PointSAModuleMSG', 'PointSAModule', 'PointFPModule', - 'PAConvSAModule', 'PAConvSAModuleMSG', 'PAConvCUDASAModule', - 'PAConvCUDASAModuleMSG' -] +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import build_sa_module +from .paconv_sa_module import (PAConvCUDASAModule, PAConvCUDASAModuleMSG, + PAConvSAModule, PAConvSAModuleMSG) +from .point_fp_module import PointFPModule +from .point_sa_module import PointSAModule, PointSAModuleMSG + +__all__ = [ + 'build_sa_module', 'PointSAModuleMSG', 'PointSAModule', 'PointFPModule', + 'PAConvSAModule', 'PAConvSAModuleMSG', 'PAConvCUDASAModule', + 'PAConvCUDASAModuleMSG' +] diff --git a/mmdet3d/ops/pointnet_modules/builder.py b/mmdet3d/ops/pointnet_modules/builder.py index 6631cb4..8b34b55 100644 --- a/mmdet3d/ops/pointnet_modules/builder.py +++ b/mmdet3d/ops/pointnet_modules/builder.py @@ -1,39 +1,39 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.utils import Registry - -SA_MODULES = Registry('point_sa_module') - - -def build_sa_module(cfg, *args, **kwargs): - """Build PointNet2 set abstraction (SA) module. - - Args: - cfg (None or dict): The SA module config, which should contain: - - type (str): Module type. - - module args: Args needed to instantiate an SA module. - args (argument list): Arguments passed to the `__init__` - method of the corresponding module. 
- kwargs (keyword arguments): Keyword arguments passed to the `__init__` - method of the corresponding SA module . - - Returns: - nn.Module: Created SA module. - """ - if cfg is None: - cfg_ = dict(type='PointSAModule') - else: - if not isinstance(cfg, dict): - raise TypeError('cfg must be a dict') - if 'type' not in cfg: - raise KeyError('the cfg dict must contain the key "type"') - cfg_ = cfg.copy() - - module_type = cfg_.pop('type') - if module_type not in SA_MODULES: - raise KeyError(f'Unrecognized module type {module_type}') - else: - sa_module = SA_MODULES.get(module_type) - - module = sa_module(*args, **kwargs, **cfg_) - - return module +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import Registry + +SA_MODULES = Registry('point_sa_module') + + +def build_sa_module(cfg, *args, **kwargs): + """Build PointNet2 set abstraction (SA) module. + + Args: + cfg (None or dict): The SA module config, which should contain: + - type (str): Module type. + - module args: Args needed to instantiate an SA module. + args (argument list): Arguments passed to the `__init__` + method of the corresponding module. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of the corresponding SA module . + + Returns: + nn.Module: Created SA module. + """ + if cfg is None: + cfg_ = dict(type='PointSAModule') + else: + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + module_type = cfg_.pop('type') + if module_type not in SA_MODULES: + raise KeyError(f'Unrecognized module type {module_type}') + else: + sa_module = SA_MODULES.get(module_type) + + module = sa_module(*args, **kwargs, **cfg_) + + return module diff --git a/mmdet3d/ops/pointnet_modules/paconv_sa_module.py b/mmdet3d/ops/pointnet_modules/paconv_sa_module.py index 361ecbb..5ed0050 100644 --- a/mmdet3d/ops/pointnet_modules/paconv_sa_module.py +++ b/mmdet3d/ops/pointnet_modules/paconv_sa_module.py @@ -1,342 +1,342 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from torch import nn as nn - -from mmdet3d.ops import PAConv, PAConvCUDA -from .builder import SA_MODULES -from .point_sa_module import BasePointSAModule - - -@SA_MODULES.register_module() -class PAConvSAModuleMSG(BasePointSAModule): - r"""Point set abstraction module with multi-scale grouping (MSG) used in - PAConv networks. - - Replace the MLPs in `PointSAModuleMSG` with PAConv layers. - See the `paper `_ for more details. - - Args: - paconv_num_kernels (list[list[int]]): Number of kernel weights in the - weight banks of each layer's PAConv. - paconv_kernel_input (str, optional): Input features to be multiplied - with kernel weights. Can be 'identity' or 'w_neighbor'. - Defaults to 'w_neighbor'. - scorenet_input (str, optional): Type of the input to ScoreNet. - Defaults to 'w_neighbor_dist'. Can be the following values: - - - 'identity': Use xyz coordinates as input. - - 'w_neighbor': Use xyz coordinates and the difference with center - points as input. - - 'w_neighbor_dist': Use xyz coordinates, the difference with - center points and the Euclidean distance as input. - - scorenet_cfg (dict, optional): Config of the ScoreNet module, which - may contain the following keys and values: - - - mlp_channels (List[int]): Hidden units of MLPs. - - score_norm (str): Normalization function of output scores. - Can be 'softmax', 'sigmoid' or 'identity'. 
- - temp_factor (float): Temperature factor to scale the output - scores before softmax. - - last_bn (bool): Whether to use BN on the last output of mlps. - """ - - def __init__(self, - num_point, - radii, - sample_nums, - mlp_channels, - paconv_num_kernels, - fps_mod=['D-FPS'], - fps_sample_range_list=[-1], - dilated_group=False, - norm_cfg=dict(type='BN2d', momentum=0.1), - use_xyz=True, - pool_mod='max', - normalize_xyz=False, - bias='auto', - paconv_kernel_input='w_neighbor', - scorenet_input='w_neighbor_dist', - scorenet_cfg=dict( - mlp_channels=[16, 16, 16], - score_norm='softmax', - temp_factor=1.0, - last_bn=False)): - super(PAConvSAModuleMSG, self).__init__( - num_point=num_point, - radii=radii, - sample_nums=sample_nums, - mlp_channels=mlp_channels, - fps_mod=fps_mod, - fps_sample_range_list=fps_sample_range_list, - dilated_group=dilated_group, - use_xyz=use_xyz, - pool_mod=pool_mod, - normalize_xyz=normalize_xyz, - grouper_return_grouped_xyz=True) - - assert len(paconv_num_kernels) == len(mlp_channels) - for i in range(len(mlp_channels)): - assert len(paconv_num_kernels[i]) == len(mlp_channels[i]) - 1, \ - 'PAConv number of kernel weights wrong' - - # in PAConv, bias only exists in ScoreNet - scorenet_cfg['bias'] = bias - - for i in range(len(self.mlp_channels)): - mlp_channel = self.mlp_channels[i] - if use_xyz: - mlp_channel[0] += 3 - - num_kernels = paconv_num_kernels[i] - - mlp = nn.Sequential() - for i in range(len(mlp_channel) - 1): - mlp.add_module( - f'layer{i}', - PAConv( - mlp_channel[i], - mlp_channel[i + 1], - num_kernels[i], - norm_cfg=norm_cfg, - kernel_input=paconv_kernel_input, - scorenet_input=scorenet_input, - scorenet_cfg=scorenet_cfg)) - self.mlps.append(mlp) - - -@SA_MODULES.register_module() -class PAConvSAModule(PAConvSAModuleMSG): - r"""Point set abstraction module with single-scale grouping (SSG) used in - PAConv networks. - - Replace the MLPs in `PointSAModule` with PAConv layers. See the `paper - `_ for more details. - """ - - def __init__(self, - mlp_channels, - paconv_num_kernels, - num_point=None, - radius=None, - num_sample=None, - norm_cfg=dict(type='BN2d', momentum=0.1), - use_xyz=True, - pool_mod='max', - fps_mod=['D-FPS'], - fps_sample_range_list=[-1], - normalize_xyz=False, - paconv_kernel_input='w_neighbor', - scorenet_input='w_neighbor_dist', - scorenet_cfg=dict( - mlp_channels=[16, 16, 16], - score_norm='softmax', - temp_factor=1.0, - last_bn=False)): - super(PAConvSAModule, self).__init__( - mlp_channels=[mlp_channels], - paconv_num_kernels=[paconv_num_kernels], - num_point=num_point, - radii=[radius], - sample_nums=[num_sample], - norm_cfg=norm_cfg, - use_xyz=use_xyz, - pool_mod=pool_mod, - fps_mod=fps_mod, - fps_sample_range_list=fps_sample_range_list, - normalize_xyz=normalize_xyz, - paconv_kernel_input=paconv_kernel_input, - scorenet_input=scorenet_input, - scorenet_cfg=scorenet_cfg) - - -@SA_MODULES.register_module() -class PAConvCUDASAModuleMSG(BasePointSAModule): - r"""Point set abstraction module with multi-scale grouping (MSG) used in - PAConv networks. - - Replace the non CUDA version PAConv with CUDA implemented PAConv for - efficient computation. See the `paper `_ - for more details. 
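# A config-style construction sketch, assuming the usual build_sa_module entry
# point from builder.py; all channel/point numbers are illustrative and not
# taken from any released PAConv config. Construction works on CPU; the
# forward pass needs a CUDA device for the sampling and grouping ops.
from mmdet3d.ops.pointnet_modules import build_sa_module

sa_cfg = dict(
    type='PAConvSAModule',
    num_point=512,
    radius=0.2,
    num_sample=32,
    mlp_channels=[6, 32, 64],   # 6-channel features; +3 for xyz is added
                                # internally when use_xyz=True
    paconv_num_kernels=[8, 8],  # one entry per PAConv layer
    use_xyz=True)
sa_module = build_sa_module(sa_cfg)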
- """ - - def __init__(self, - num_point, - radii, - sample_nums, - mlp_channels, - paconv_num_kernels, - fps_mod=['D-FPS'], - fps_sample_range_list=[-1], - dilated_group=False, - norm_cfg=dict(type='BN2d', momentum=0.1), - use_xyz=True, - pool_mod='max', - normalize_xyz=False, - bias='auto', - paconv_kernel_input='w_neighbor', - scorenet_input='w_neighbor_dist', - scorenet_cfg=dict( - mlp_channels=[8, 16, 16], - score_norm='softmax', - temp_factor=1.0, - last_bn=False)): - super(PAConvCUDASAModuleMSG, self).__init__( - num_point=num_point, - radii=radii, - sample_nums=sample_nums, - mlp_channels=mlp_channels, - fps_mod=fps_mod, - fps_sample_range_list=fps_sample_range_list, - dilated_group=dilated_group, - use_xyz=use_xyz, - pool_mod=pool_mod, - normalize_xyz=normalize_xyz, - grouper_return_grouped_xyz=True, - grouper_return_grouped_idx=True) - - assert len(paconv_num_kernels) == len(mlp_channels) - for i in range(len(mlp_channels)): - assert len(paconv_num_kernels[i]) == len(mlp_channels[i]) - 1, \ - 'PAConv number of kernel weights wrong' - - # in PAConv, bias only exists in ScoreNet - scorenet_cfg['bias'] = bias - - # we need to manually concat xyz for CUDA implemented PAConv - self.use_xyz = use_xyz - - for i in range(len(self.mlp_channels)): - mlp_channel = self.mlp_channels[i] - if use_xyz: - mlp_channel[0] += 3 - - num_kernels = paconv_num_kernels[i] - - # can't use `nn.Sequential` for PAConvCUDA because its input and - # output have different shapes - mlp = nn.ModuleList() - for i in range(len(mlp_channel) - 1): - mlp.append( - PAConvCUDA( - mlp_channel[i], - mlp_channel[i + 1], - num_kernels[i], - norm_cfg=norm_cfg, - kernel_input=paconv_kernel_input, - scorenet_input=scorenet_input, - scorenet_cfg=scorenet_cfg)) - self.mlps.append(mlp) - - def forward( - self, - points_xyz, - features=None, - indices=None, - target_xyz=None, - ): - """forward. - - Args: - points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. - features (Tensor, optional): (B, C, N) features of each point. - Default: None. - indices (Tensor, optional): (B, num_point) Index of the features. - Default: None. - target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs. - Default: None. - - Returns: - Tensor: (B, M, 3) where M is the number of points. - New features xyz. - Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number - of points. New feature descriptors. - Tensor: (B, M) where M is the number of points. - Index of the features. 
- """ - new_features_list = [] - - # sample points, (B, num_point, 3), (B, num_point) - new_xyz, indices = self._sample_points(points_xyz, features, indices, - target_xyz) - - for i in range(len(self.groupers)): - xyz = points_xyz - new_features = features - for j in range(len(self.mlps[i])): - # we don't use grouped_features here to avoid large GPU memory - # _, (B, 3, num_point, nsample), (B, num_point, nsample) - _, grouped_xyz, grouped_idx = self.groupers[i](xyz, new_xyz, - new_features) - - # concat xyz as additional features - if self.use_xyz and j == 0: - # (B, C+3, N) - new_features = torch.cat( - (points_xyz.permute(0, 2, 1), new_features), dim=1) - - # (B, out_c, num_point, nsample) - grouped_new_features = self.mlps[i][j]( - (new_features, grouped_xyz, grouped_idx.long()))[0] - - # different from PointNet++ and non CUDA version of PAConv - # CUDA version of PAConv needs to aggregate local features - # every time after it passes through a Conv layer - # in order to transform to valid input shape - # (B, out_c, num_point) - new_features = self._pool_features(grouped_new_features) - - # constrain the points to be grouped for next PAConv layer - # because new_features only contains sampled centers now - # (B, num_point, 3) - xyz = new_xyz - - new_features_list.append(new_features) - - return new_xyz, torch.cat(new_features_list, dim=1), indices - - -@SA_MODULES.register_module() -class PAConvCUDASAModule(PAConvCUDASAModuleMSG): - r"""Point set abstraction module with single-scale grouping (SSG) used in - PAConv networks. - - Replace the non CUDA version PAConv with CUDA implemented PAConv for - efficient computation. See the `paper `_ - for more details. - """ - - def __init__(self, - mlp_channels, - paconv_num_kernels, - num_point=None, - radius=None, - num_sample=None, - norm_cfg=dict(type='BN2d', momentum=0.1), - use_xyz=True, - pool_mod='max', - fps_mod=['D-FPS'], - fps_sample_range_list=[-1], - normalize_xyz=False, - paconv_kernel_input='w_neighbor', - scorenet_input='w_neighbor_dist', - scorenet_cfg=dict( - mlp_channels=[8, 16, 16], - score_norm='softmax', - temp_factor=1.0, - last_bn=False)): - super(PAConvCUDASAModule, self).__init__( - mlp_channels=[mlp_channels], - paconv_num_kernels=[paconv_num_kernels], - num_point=num_point, - radii=[radius], - sample_nums=[num_sample], - norm_cfg=norm_cfg, - use_xyz=use_xyz, - pool_mod=pool_mod, - fps_mod=fps_mod, - fps_sample_range_list=fps_sample_range_list, - normalize_xyz=normalize_xyz, - paconv_kernel_input=paconv_kernel_input, - scorenet_input=scorenet_input, - scorenet_cfg=scorenet_cfg) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn + +from mmdet3d.ops import PAConv, PAConvCUDA +from .builder import SA_MODULES +from .point_sa_module import BasePointSAModule + + +@SA_MODULES.register_module() +class PAConvSAModuleMSG(BasePointSAModule): + r"""Point set abstraction module with multi-scale grouping (MSG) used in + PAConv networks. + + Replace the MLPs in `PointSAModuleMSG` with PAConv layers. + See the `paper `_ for more details. + + Args: + paconv_num_kernels (list[list[int]]): Number of kernel weights in the + weight banks of each layer's PAConv. + paconv_kernel_input (str, optional): Input features to be multiplied + with kernel weights. Can be 'identity' or 'w_neighbor'. + Defaults to 'w_neighbor'. + scorenet_input (str, optional): Type of the input to ScoreNet. + Defaults to 'w_neighbor_dist'. Can be the following values: + + - 'identity': Use xyz coordinates as input. 
+ - 'w_neighbor': Use xyz coordinates and the difference with center + points as input. + - 'w_neighbor_dist': Use xyz coordinates, the difference with + center points and the Euclidean distance as input. + + scorenet_cfg (dict, optional): Config of the ScoreNet module, which + may contain the following keys and values: + + - mlp_channels (List[int]): Hidden units of MLPs. + - score_norm (str): Normalization function of output scores. + Can be 'softmax', 'sigmoid' or 'identity'. + - temp_factor (float): Temperature factor to scale the output + scores before softmax. + - last_bn (bool): Whether to use BN on the last output of mlps. + """ + + def __init__(self, + num_point, + radii, + sample_nums, + mlp_channels, + paconv_num_kernels, + fps_mod=['D-FPS'], + fps_sample_range_list=[-1], + dilated_group=False, + norm_cfg=dict(type='BN2d', momentum=0.1), + use_xyz=True, + pool_mod='max', + normalize_xyz=False, + bias='auto', + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[16, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False)): + super(PAConvSAModuleMSG, self).__init__( + num_point=num_point, + radii=radii, + sample_nums=sample_nums, + mlp_channels=mlp_channels, + fps_mod=fps_mod, + fps_sample_range_list=fps_sample_range_list, + dilated_group=dilated_group, + use_xyz=use_xyz, + pool_mod=pool_mod, + normalize_xyz=normalize_xyz, + grouper_return_grouped_xyz=True) + + assert len(paconv_num_kernels) == len(mlp_channels) + for i in range(len(mlp_channels)): + assert len(paconv_num_kernels[i]) == len(mlp_channels[i]) - 1, \ + 'PAConv number of kernel weights wrong' + + # in PAConv, bias only exists in ScoreNet + scorenet_cfg['bias'] = bias + + for i in range(len(self.mlp_channels)): + mlp_channel = self.mlp_channels[i] + if use_xyz: + mlp_channel[0] += 3 + + num_kernels = paconv_num_kernels[i] + + mlp = nn.Sequential() + for i in range(len(mlp_channel) - 1): + mlp.add_module( + f'layer{i}', + PAConv( + mlp_channel[i], + mlp_channel[i + 1], + num_kernels[i], + norm_cfg=norm_cfg, + kernel_input=paconv_kernel_input, + scorenet_input=scorenet_input, + scorenet_cfg=scorenet_cfg)) + self.mlps.append(mlp) + + +@SA_MODULES.register_module() +class PAConvSAModule(PAConvSAModuleMSG): + r"""Point set abstraction module with single-scale grouping (SSG) used in + PAConv networks. + + Replace the MLPs in `PointSAModule` with PAConv layers. See the `paper + `_ for more details. + """ + + def __init__(self, + mlp_channels, + paconv_num_kernels, + num_point=None, + radius=None, + num_sample=None, + norm_cfg=dict(type='BN2d', momentum=0.1), + use_xyz=True, + pool_mod='max', + fps_mod=['D-FPS'], + fps_sample_range_list=[-1], + normalize_xyz=False, + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[16, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False)): + super(PAConvSAModule, self).__init__( + mlp_channels=[mlp_channels], + paconv_num_kernels=[paconv_num_kernels], + num_point=num_point, + radii=[radius], + sample_nums=[num_sample], + norm_cfg=norm_cfg, + use_xyz=use_xyz, + pool_mod=pool_mod, + fps_mod=fps_mod, + fps_sample_range_list=fps_sample_range_list, + normalize_xyz=normalize_xyz, + paconv_kernel_input=paconv_kernel_input, + scorenet_input=scorenet_input, + scorenet_cfg=scorenet_cfg) + + +@SA_MODULES.register_module() +class PAConvCUDASAModuleMSG(BasePointSAModule): + r"""Point set abstraction module with multi-scale grouping (MSG) used in + PAConv networks. 
+ + Replace the non CUDA version PAConv with CUDA implemented PAConv for + efficient computation. See the `paper `_ + for more details. + """ + + def __init__(self, + num_point, + radii, + sample_nums, + mlp_channels, + paconv_num_kernels, + fps_mod=['D-FPS'], + fps_sample_range_list=[-1], + dilated_group=False, + norm_cfg=dict(type='BN2d', momentum=0.1), + use_xyz=True, + pool_mod='max', + normalize_xyz=False, + bias='auto', + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[8, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False)): + super(PAConvCUDASAModuleMSG, self).__init__( + num_point=num_point, + radii=radii, + sample_nums=sample_nums, + mlp_channels=mlp_channels, + fps_mod=fps_mod, + fps_sample_range_list=fps_sample_range_list, + dilated_group=dilated_group, + use_xyz=use_xyz, + pool_mod=pool_mod, + normalize_xyz=normalize_xyz, + grouper_return_grouped_xyz=True, + grouper_return_grouped_idx=True) + + assert len(paconv_num_kernels) == len(mlp_channels) + for i in range(len(mlp_channels)): + assert len(paconv_num_kernels[i]) == len(mlp_channels[i]) - 1, \ + 'PAConv number of kernel weights wrong' + + # in PAConv, bias only exists in ScoreNet + scorenet_cfg['bias'] = bias + + # we need to manually concat xyz for CUDA implemented PAConv + self.use_xyz = use_xyz + + for i in range(len(self.mlp_channels)): + mlp_channel = self.mlp_channels[i] + if use_xyz: + mlp_channel[0] += 3 + + num_kernels = paconv_num_kernels[i] + + # can't use `nn.Sequential` for PAConvCUDA because its input and + # output have different shapes + mlp = nn.ModuleList() + for i in range(len(mlp_channel) - 1): + mlp.append( + PAConvCUDA( + mlp_channel[i], + mlp_channel[i + 1], + num_kernels[i], + norm_cfg=norm_cfg, + kernel_input=paconv_kernel_input, + scorenet_input=scorenet_input, + scorenet_cfg=scorenet_cfg)) + self.mlps.append(mlp) + + def forward( + self, + points_xyz, + features=None, + indices=None, + target_xyz=None, + ): + """forward. + + Args: + points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. + features (Tensor, optional): (B, C, N) features of each point. + Default: None. + indices (Tensor, optional): (B, num_point) Index of the features. + Default: None. + target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs. + Default: None. + + Returns: + Tensor: (B, M, 3) where M is the number of points. + New features xyz. + Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number + of points. New feature descriptors. + Tensor: (B, M) where M is the number of points. + Index of the features. 
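# A shape-level sketch of the forward contract documented above. The builder
# helper build_sa_module, the channel sizes and the point counts below are
# illustrative assumptions (the type name is resolved through the SA_MODULES
# registry), and these grouping / PAConv ops expect CUDA tensors.
import torch
from mmdet3d.ops import build_sa_module

sa_module = build_sa_module(
    dict(
        type='PAConvCUDASAModuleMSG',
        num_point=256,
        radii=[0.2, 0.4],
        sample_nums=[16, 32],
        mlp_channels=[[6, 32, 64], [6, 64, 128]],
        paconv_num_kernels=[[8, 8], [8, 8]])).cuda()

points_xyz = torch.rand(2, 1024, 3).cuda()   # (B, N, 3)
features = torch.rand(2, 6, 1024).cuda()     # (B, C, N)
new_xyz, new_features, indices = sa_module(points_xyz, features)
# new_xyz:      (2, 256, 3)    sampled centers
# new_features: (2, 192, 256)  64 + 128 channels concatenated over the scales
# indices:      (2, 256)       FPS indices into the 1024 input points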
+ """ + new_features_list = [] + + # sample points, (B, num_point, 3), (B, num_point) + new_xyz, indices = self._sample_points(points_xyz, features, indices, + target_xyz) + + for i in range(len(self.groupers)): + xyz = points_xyz + new_features = features + for j in range(len(self.mlps[i])): + # we don't use grouped_features here to avoid large GPU memory + # _, (B, 3, num_point, nsample), (B, num_point, nsample) + _, grouped_xyz, grouped_idx = self.groupers[i](xyz, new_xyz, + new_features) + + # concat xyz as additional features + if self.use_xyz and j == 0: + # (B, C+3, N) + new_features = torch.cat( + (points_xyz.permute(0, 2, 1), new_features), dim=1) + + # (B, out_c, num_point, nsample) + grouped_new_features = self.mlps[i][j]( + (new_features, grouped_xyz, grouped_idx.long()))[0] + + # different from PointNet++ and non CUDA version of PAConv + # CUDA version of PAConv needs to aggregate local features + # every time after it passes through a Conv layer + # in order to transform to valid input shape + # (B, out_c, num_point) + new_features = self._pool_features(grouped_new_features) + + # constrain the points to be grouped for next PAConv layer + # because new_features only contains sampled centers now + # (B, num_point, 3) + xyz = new_xyz + + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1), indices + + +@SA_MODULES.register_module() +class PAConvCUDASAModule(PAConvCUDASAModuleMSG): + r"""Point set abstraction module with single-scale grouping (SSG) used in + PAConv networks. + + Replace the non CUDA version PAConv with CUDA implemented PAConv for + efficient computation. See the `paper `_ + for more details. + """ + + def __init__(self, + mlp_channels, + paconv_num_kernels, + num_point=None, + radius=None, + num_sample=None, + norm_cfg=dict(type='BN2d', momentum=0.1), + use_xyz=True, + pool_mod='max', + fps_mod=['D-FPS'], + fps_sample_range_list=[-1], + normalize_xyz=False, + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[8, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False)): + super(PAConvCUDASAModule, self).__init__( + mlp_channels=[mlp_channels], + paconv_num_kernels=[paconv_num_kernels], + num_point=num_point, + radii=[radius], + sample_nums=[num_sample], + norm_cfg=norm_cfg, + use_xyz=use_xyz, + pool_mod=pool_mod, + fps_mod=fps_mod, + fps_sample_range_list=fps_sample_range_list, + normalize_xyz=normalize_xyz, + paconv_kernel_input=paconv_kernel_input, + scorenet_input=scorenet_input, + scorenet_cfg=scorenet_cfg) diff --git a/mmdet3d/ops/pointnet_modules/point_fp_module.py b/mmdet3d/ops/pointnet_modules/point_fp_module.py index 1bc833e..3b97065 100644 --- a/mmdet3d/ops/pointnet_modules/point_fp_module.py +++ b/mmdet3d/ops/pointnet_modules/point_fp_module.py @@ -1,79 +1,79 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List - -import torch -from mmcv.cnn import ConvModule -from mmcv.ops import three_interpolate, three_nn -from mmcv.runner import BaseModule, force_fp32 -from torch import nn as nn - - -class PointFPModule(BaseModule): - """Point feature propagation module used in PointNets. - - Propagate the features from one set to another. - - Args: - mlp_channels (list[int]): List of mlp channels. - norm_cfg (dict, optional): Type of normalization method. - Default: dict(type='BN2d'). 
- """ - - def __init__(self, - mlp_channels: List[int], - norm_cfg: dict = dict(type='BN2d'), - init_cfg=None): - super().__init__(init_cfg=init_cfg) - self.fp16_enabled = False - self.mlps = nn.Sequential() - for i in range(len(mlp_channels) - 1): - self.mlps.add_module( - f'layer{i}', - ConvModule( - mlp_channels[i], - mlp_channels[i + 1], - kernel_size=(1, 1), - stride=(1, 1), - conv_cfg=dict(type='Conv2d'), - norm_cfg=norm_cfg)) - - @force_fp32() - def forward(self, target: torch.Tensor, source: torch.Tensor, - target_feats: torch.Tensor, - source_feats: torch.Tensor) -> torch.Tensor: - """forward. - - Args: - target (Tensor): (B, n, 3) tensor of the xyz positions of - the target features. - source (Tensor): (B, m, 3) tensor of the xyz positions of - the source features. - target_feats (Tensor): (B, C1, n) tensor of the features to be - propagated to. - source_feats (Tensor): (B, C2, m) tensor of features - to be propagated. - - Return: - Tensor: (B, M, N) M = mlp[-1], tensor of the target features. - """ - if source is not None: - dist, idx = three_nn(target, source) - dist_reciprocal = 1.0 / (dist + 1e-8) - norm = torch.sum(dist_reciprocal, dim=2, keepdim=True) - weight = dist_reciprocal / norm - - interpolated_feats = three_interpolate(source_feats, idx, weight) - else: - interpolated_feats = source_feats.expand(*source_feats.size()[0:2], - target.size(1)) - - if target_feats is not None: - new_features = torch.cat([interpolated_feats, target_feats], - dim=1) # (B, C2 + C1, n) - else: - new_features = interpolated_feats - - new_features = new_features.unsqueeze(-1) - new_features = self.mlps(new_features) - - return new_features.squeeze(-1) +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from mmcv.cnn import ConvModule +from mmcv.ops import three_interpolate, three_nn +from mmcv.runner import BaseModule, force_fp32 +from torch import nn as nn + + +class PointFPModule(BaseModule): + """Point feature propagation module used in PointNets. + + Propagate the features from one set to another. + + Args: + mlp_channels (list[int]): List of mlp channels. + norm_cfg (dict, optional): Type of normalization method. + Default: dict(type='BN2d'). + """ + + def __init__(self, + mlp_channels: List[int], + norm_cfg: dict = dict(type='BN2d'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.fp16_enabled = False + self.mlps = nn.Sequential() + for i in range(len(mlp_channels) - 1): + self.mlps.add_module( + f'layer{i}', + ConvModule( + mlp_channels[i], + mlp_channels[i + 1], + kernel_size=(1, 1), + stride=(1, 1), + conv_cfg=dict(type='Conv2d'), + norm_cfg=norm_cfg)) + + @force_fp32() + def forward(self, target: torch.Tensor, source: torch.Tensor, + target_feats: torch.Tensor, + source_feats: torch.Tensor) -> torch.Tensor: + """forward. + + Args: + target (Tensor): (B, n, 3) tensor of the xyz positions of + the target features. + source (Tensor): (B, m, 3) tensor of the xyz positions of + the source features. + target_feats (Tensor): (B, C1, n) tensor of the features to be + propagated to. + source_feats (Tensor): (B, C2, m) tensor of features + to be propagated. + + Return: + Tensor: (B, M, N) M = mlp[-1], tensor of the target features. 
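# A small usage sketch of the propagation contract described above; batch
# size and channel counts are illustrative assumptions, and three_nn /
# three_interpolate expect CUDA tensors.
import torch
from mmdet3d.ops.pointnet_modules.point_fp_module import PointFPModule

# the first mlp channel must equal C2 + C1 (interpolated + skip features)
fp = PointFPModule(mlp_channels=[128 + 64, 256, 256]).cuda()

target = torch.rand(2, 1024, 3).cuda()         # (B, n, 3) dense points
source = torch.rand(2, 256, 3).cuda()          # (B, m, 3) coarse points
target_feats = torch.rand(2, 64, 1024).cuda()  # (B, C1, n)
source_feats = torch.rand(2, 128, 256).cuda()  # (B, C2, m)

out = fp(target, source, target_feats, source_feats)
# out: (2, 256, 1024), i.e. source features interpolated onto the dense set,
# concatenated with target_feats and passed through the 1x1 ConvModule stack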
+ """ + if source is not None: + dist, idx = three_nn(target, source) + dist_reciprocal = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_reciprocal, dim=2, keepdim=True) + weight = dist_reciprocal / norm + + interpolated_feats = three_interpolate(source_feats, idx, weight) + else: + interpolated_feats = source_feats.expand(*source_feats.size()[0:2], + target.size(1)) + + if target_feats is not None: + new_features = torch.cat([interpolated_feats, target_feats], + dim=1) # (B, C2 + C1, n) + else: + new_features = interpolated_feats + + new_features = new_features.unsqueeze(-1) + new_features = self.mlps(new_features) + + return new_features.squeeze(-1) diff --git a/mmdet3d/ops/pointnet_modules/point_sa_module.py b/mmdet3d/ops/pointnet_modules/point_sa_module.py index e33377f..8fa854f 100644 --- a/mmdet3d/ops/pointnet_modules/point_sa_module.py +++ b/mmdet3d/ops/pointnet_modules/point_sa_module.py @@ -1,352 +1,352 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import torch -from mmcv.cnn import ConvModule -from mmcv.ops import GroupAll -from mmcv.ops import PointsSampler as Points_Sampler -from mmcv.ops import QueryAndGroup, gather_points -from torch import nn as nn -from torch.nn import functional as F - -from mmdet3d.ops import PAConv -from .builder import SA_MODULES - - -class BasePointSAModule(nn.Module): - """Base module for point set abstraction module used in PointNets. - - Args: - num_point (int): Number of points. - radii (list[float]): List of radius in each ball query. - sample_nums (list[int]): Number of samples in each ball query. - mlp_channels (list[list[int]]): Specify of the pointnet before - the global pooling for each scale. - fps_mod (list[str], optional): Type of FPS method, valid mod - ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. - F-FPS: using feature distances for FPS. - D-FPS: using Euclidean distances of points for FPS. - FS: using F-FPS and D-FPS simultaneously. - fps_sample_range_list (list[int], optional): - Range of points to apply FPS. Default: [-1]. - dilated_group (bool, optional): Whether to use dilated ball query. - Default: False. - use_xyz (bool, optional): Whether to use xyz. - Default: True. - pool_mod (str, optional): Type of pooling method. - Default: 'max_pool'. - normalize_xyz (bool, optional): Whether to normalize local XYZ - with radius. Default: False. - grouper_return_grouped_xyz (bool, optional): Whether to return - grouped xyz in `QueryAndGroup`. Defaults to False. - grouper_return_grouped_idx (bool, optional): Whether to return - grouped idx in `QueryAndGroup`. Defaults to False. 
- """ - - def __init__(self, - num_point, - radii, - sample_nums, - mlp_channels, - fps_mod=['D-FPS'], - fps_sample_range_list=[-1], - dilated_group=False, - use_xyz=True, - pool_mod='max', - normalize_xyz=False, - grouper_return_grouped_xyz=False, - grouper_return_grouped_idx=False): - super(BasePointSAModule, self).__init__() - - assert len(radii) == len(sample_nums) == len(mlp_channels) - assert pool_mod in ['max', 'avg'] - assert isinstance(fps_mod, list) or isinstance(fps_mod, tuple) - assert isinstance(fps_sample_range_list, list) or isinstance( - fps_sample_range_list, tuple) - assert len(fps_mod) == len(fps_sample_range_list) - - if isinstance(mlp_channels, tuple): - mlp_channels = list(map(list, mlp_channels)) - self.mlp_channels = mlp_channels - - if isinstance(num_point, int): - self.num_point = [num_point] - elif isinstance(num_point, list) or isinstance(num_point, tuple): - self.num_point = num_point - elif num_point is None: - self.num_point = None - else: - raise NotImplementedError('Error type of num_point!') - - self.pool_mod = pool_mod - self.groupers = nn.ModuleList() - self.mlps = nn.ModuleList() - self.fps_mod_list = fps_mod - self.fps_sample_range_list = fps_sample_range_list - - if self.num_point is not None: - self.points_sampler = Points_Sampler(self.num_point, - self.fps_mod_list, - self.fps_sample_range_list) - else: - self.points_sampler = None - - for i in range(len(radii)): - radius = radii[i] - sample_num = sample_nums[i] - if num_point is not None: - if dilated_group and i != 0: - min_radius = radii[i - 1] - else: - min_radius = 0 - grouper = QueryAndGroup( - radius, - sample_num, - min_radius=min_radius, - use_xyz=use_xyz, - normalize_xyz=normalize_xyz, - return_grouped_xyz=grouper_return_grouped_xyz, - return_grouped_idx=grouper_return_grouped_idx) - else: - grouper = GroupAll(use_xyz) - self.groupers.append(grouper) - - def _sample_points(self, points_xyz, features, indices, target_xyz): - """Perform point sampling based on inputs. - - If `indices` is specified, directly sample corresponding points. - Else if `target_xyz` is specified, use is as sampled points. - Otherwise sample points using `self.points_sampler`. - - Args: - points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. - features (Tensor): (B, C, N) features of each point. - indices (Tensor): (B, num_point) Index of the features. - target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs. - - Returns: - Tensor: (B, num_point, 3) sampled xyz coordinates of points. - Tensor: (B, num_point) sampled points' index. - """ - xyz_flipped = points_xyz.transpose(1, 2).contiguous() - if indices is not None: - assert (indices.shape[1] == self.num_point[0]) - new_xyz = gather_points(xyz_flipped, indices).transpose( - 1, 2).contiguous() if self.num_point is not None else None - elif target_xyz is not None: - new_xyz = target_xyz.contiguous() - else: - if self.num_point is not None: - indices = self.points_sampler(points_xyz, features) - new_xyz = gather_points(xyz_flipped, - indices).transpose(1, 2).contiguous() - else: - new_xyz = None - - return new_xyz, indices - - def _pool_features(self, features): - """Perform feature aggregation using pooling operation. - - Args: - features (torch.Tensor): (B, C, N, K) - Features of locally grouped points before pooling. - - Returns: - torch.Tensor: (B, C, N) - Pooled features aggregating local information. 
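# A plain-torch view of the pooling described above: the neighbour axis K of
# the grouped features is reduced to one descriptor per sampled centre.
# Tensor sizes are illustrative assumptions.
import torch
import torch.nn.functional as F

grouped = torch.rand(2, 64, 128, 16)                 # (B, C, N, K)
pooled = F.max_pool2d(grouped, kernel_size=[1, 16])  # (B, C, N, 1), 'max' mode
pooled = pooled.squeeze(-1)                          # (B, C, N)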
- """ - if self.pool_mod == 'max': - # (B, C, N, 1) - new_features = F.max_pool2d( - features, kernel_size=[1, features.size(3)]) - elif self.pool_mod == 'avg': - # (B, C, N, 1) - new_features = F.avg_pool2d( - features, kernel_size=[1, features.size(3)]) - else: - raise NotImplementedError - - return new_features.squeeze(-1).contiguous() - - def forward( - self, - points_xyz, - features=None, - indices=None, - target_xyz=None, - ): - """forward. - - Args: - points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. - features (Tensor, optional): (B, C, N) features of each point. - Default: None. - indices (Tensor, optional): (B, num_point) Index of the features. - Default: None. - target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs. - Default: None. - - Returns: - Tensor: (B, M, 3) where M is the number of points. - New features xyz. - Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number - of points. New feature descriptors. - Tensor: (B, M) where M is the number of points. - Index of the features. - """ - new_features_list = [] - - # sample points, (B, num_point, 3), (B, num_point) - new_xyz, indices = self._sample_points(points_xyz, features, indices, - target_xyz) - - for i in range(len(self.groupers)): - # grouped_results may contain: - # - grouped_features: (B, C, num_point, nsample) - # - grouped_xyz: (B, 3, num_point, nsample) - # - grouped_idx: (B, num_point, nsample) - grouped_results = self.groupers[i](points_xyz, new_xyz, features) - - # (B, mlp[-1], num_point, nsample) - new_features = self.mlps[i](grouped_results) - - # this is a bit hack because PAConv outputs two values - # we take the first one as feature - if isinstance(self.mlps[i][0], PAConv): - assert isinstance(new_features, tuple) - new_features = new_features[0] - - # (B, mlp[-1], num_point) - new_features = self._pool_features(new_features) - new_features_list.append(new_features) - - return new_xyz, torch.cat(new_features_list, dim=1), indices - - -@SA_MODULES.register_module() -class PointSAModuleMSG(BasePointSAModule): - """Point set abstraction module with multi-scale grouping (MSG) used in - PointNets. - - Args: - num_point (int): Number of points. - radii (list[float]): List of radius in each ball query. - sample_nums (list[int]): Number of samples in each ball query. - mlp_channels (list[list[int]]): Specify of the pointnet before - the global pooling for each scale. - fps_mod (list[str], optional): Type of FPS method, valid mod - ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. - F-FPS: using feature distances for FPS. - D-FPS: using Euclidean distances of points for FPS. - FS: using F-FPS and D-FPS simultaneously. - fps_sample_range_list (list[int], optional): Range of points to - apply FPS. Default: [-1]. - dilated_group (bool, optional): Whether to use dilated ball query. - Default: False. - norm_cfg (dict, optional): Type of normalization method. - Default: dict(type='BN2d'). - use_xyz (bool, optional): Whether to use xyz. - Default: True. - pool_mod (str, optional): Type of pooling method. - Default: 'max_pool'. - normalize_xyz (bool, optional): Whether to normalize local XYZ - with radius. Default: False. - bias (bool | str, optional): If specified as `auto`, it will be - decided by `norm_cfg`. `bias` will be set as True if - `norm_cfg` is None, otherwise False. Default: 'auto'. 
- """ - - def __init__(self, - num_point, - radii, - sample_nums, - mlp_channels, - fps_mod=['D-FPS'], - fps_sample_range_list=[-1], - dilated_group=False, - norm_cfg=dict(type='BN2d'), - use_xyz=True, - pool_mod='max', - normalize_xyz=False, - bias='auto'): - super(PointSAModuleMSG, self).__init__( - num_point=num_point, - radii=radii, - sample_nums=sample_nums, - mlp_channels=mlp_channels, - fps_mod=fps_mod, - fps_sample_range_list=fps_sample_range_list, - dilated_group=dilated_group, - use_xyz=use_xyz, - pool_mod=pool_mod, - normalize_xyz=normalize_xyz) - - for i in range(len(self.mlp_channels)): - mlp_channel = self.mlp_channels[i] - if use_xyz: - mlp_channel[0] += 3 - - mlp = nn.Sequential() - for i in range(len(mlp_channel) - 1): - mlp.add_module( - f'layer{i}', - ConvModule( - mlp_channel[i], - mlp_channel[i + 1], - kernel_size=(1, 1), - stride=(1, 1), - conv_cfg=dict(type='Conv2d'), - norm_cfg=norm_cfg, - bias=bias)) - self.mlps.append(mlp) - - -@SA_MODULES.register_module() -class PointSAModule(PointSAModuleMSG): - """Point set abstraction module with single-scale grouping (SSG) used in - PointNets. - - Args: - mlp_channels (list[int]): Specify of the pointnet before - the global pooling for each scale. - num_point (int, optional): Number of points. - Default: None. - radius (float, optional): Radius to group with. - Default: None. - num_sample (int, optional): Number of samples in each ball query. - Default: None. - norm_cfg (dict, optional): Type of normalization method. - Default: dict(type='BN2d'). - use_xyz (bool, optional): Whether to use xyz. - Default: True. - pool_mod (str, optional): Type of pooling method. - Default: 'max_pool'. - fps_mod (list[str], optional): Type of FPS method, valid mod - ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. - fps_sample_range_list (list[int], optional): Range of points - to apply FPS. Default: [-1]. - normalize_xyz (bool, optional): Whether to normalize local XYZ - with radius. Default: False. - """ - - def __init__(self, - mlp_channels, - num_point=None, - radius=None, - num_sample=None, - norm_cfg=dict(type='BN2d'), - use_xyz=True, - pool_mod='max', - fps_mod=['D-FPS'], - fps_sample_range_list=[-1], - normalize_xyz=False): - super(PointSAModule, self).__init__( - mlp_channels=[mlp_channels], - num_point=num_point, - radii=[radius], - sample_nums=[num_sample], - norm_cfg=norm_cfg, - use_xyz=use_xyz, - pool_mod=pool_mod, - fps_mod=fps_mod, - fps_sample_range_list=fps_sample_range_list, - normalize_xyz=normalize_xyz) +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.cnn import ConvModule +from mmcv.ops import GroupAll +from mmcv.ops import PointsSampler as Points_Sampler +from mmcv.ops import QueryAndGroup, gather_points +from torch import nn as nn +from torch.nn import functional as F + +from mmdet3d.ops import PAConv +from .builder import SA_MODULES + + +class BasePointSAModule(nn.Module): + """Base module for point set abstraction module used in PointNets. + + Args: + num_point (int): Number of points. + radii (list[float]): List of radius in each ball query. + sample_nums (list[int]): Number of samples in each ball query. + mlp_channels (list[list[int]]): Specify of the pointnet before + the global pooling for each scale. + fps_mod (list[str], optional): Type of FPS method, valid mod + ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. + F-FPS: using feature distances for FPS. + D-FPS: using Euclidean distances of points for FPS. + FS: using F-FPS and D-FPS simultaneously. 
+ fps_sample_range_list (list[int], optional): + Range of points to apply FPS. Default: [-1]. + dilated_group (bool, optional): Whether to use dilated ball query. + Default: False. + use_xyz (bool, optional): Whether to use xyz. + Default: True. + pool_mod (str, optional): Type of pooling method. + Default: 'max_pool'. + normalize_xyz (bool, optional): Whether to normalize local XYZ + with radius. Default: False. + grouper_return_grouped_xyz (bool, optional): Whether to return + grouped xyz in `QueryAndGroup`. Defaults to False. + grouper_return_grouped_idx (bool, optional): Whether to return + grouped idx in `QueryAndGroup`. Defaults to False. + """ + + def __init__(self, + num_point, + radii, + sample_nums, + mlp_channels, + fps_mod=['D-FPS'], + fps_sample_range_list=[-1], + dilated_group=False, + use_xyz=True, + pool_mod='max', + normalize_xyz=False, + grouper_return_grouped_xyz=False, + grouper_return_grouped_idx=False): + super(BasePointSAModule, self).__init__() + + assert len(radii) == len(sample_nums) == len(mlp_channels) + assert pool_mod in ['max', 'avg'] + assert isinstance(fps_mod, list) or isinstance(fps_mod, tuple) + assert isinstance(fps_sample_range_list, list) or isinstance( + fps_sample_range_list, tuple) + assert len(fps_mod) == len(fps_sample_range_list) + + if isinstance(mlp_channels, tuple): + mlp_channels = list(map(list, mlp_channels)) + self.mlp_channels = mlp_channels + + if isinstance(num_point, int): + self.num_point = [num_point] + elif isinstance(num_point, list) or isinstance(num_point, tuple): + self.num_point = num_point + elif num_point is None: + self.num_point = None + else: + raise NotImplementedError('Error type of num_point!') + + self.pool_mod = pool_mod + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + self.fps_mod_list = fps_mod + self.fps_sample_range_list = fps_sample_range_list + + if self.num_point is not None: + self.points_sampler = Points_Sampler(self.num_point, + self.fps_mod_list, + self.fps_sample_range_list) + else: + self.points_sampler = None + + for i in range(len(radii)): + radius = radii[i] + sample_num = sample_nums[i] + if num_point is not None: + if dilated_group and i != 0: + min_radius = radii[i - 1] + else: + min_radius = 0 + grouper = QueryAndGroup( + radius, + sample_num, + min_radius=min_radius, + use_xyz=use_xyz, + normalize_xyz=normalize_xyz, + return_grouped_xyz=grouper_return_grouped_xyz, + return_grouped_idx=grouper_return_grouped_idx) + else: + grouper = GroupAll(use_xyz) + self.groupers.append(grouper) + + def _sample_points(self, points_xyz, features, indices, target_xyz): + """Perform point sampling based on inputs. + + If `indices` is specified, directly sample corresponding points. + Else if `target_xyz` is specified, use is as sampled points. + Otherwise sample points using `self.points_sampler`. + + Args: + points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. + features (Tensor): (B, C, N) features of each point. + indices (Tensor): (B, num_point) Index of the features. + target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs. + + Returns: + Tensor: (B, num_point, 3) sampled xyz coordinates of points. + Tensor: (B, num_point) sampled points' index. 
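# A plain-torch illustration of the `indices` branch documented above: when
# indices are supplied, the sampled centres are simply those rows of
# points_xyz, which is what gather_points() on the transposed tensor yields.
# Sizes are illustrative assumptions.
import torch

points_xyz = torch.rand(2, 1024, 3)           # (B, N, 3)
indices = torch.randint(0, 1024, (2, 256))    # (B, num_point)

new_xyz = torch.stack(
    [points_xyz[b, indices[b]] for b in range(points_xyz.size(0))])
# new_xyz: (2, 256, 3), matching the (B, num_point, 3) return described above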
+ """ + xyz_flipped = points_xyz.transpose(1, 2).contiguous() + if indices is not None: + assert (indices.shape[1] == self.num_point[0]) + new_xyz = gather_points(xyz_flipped, indices).transpose( + 1, 2).contiguous() if self.num_point is not None else None + elif target_xyz is not None: + new_xyz = target_xyz.contiguous() + else: + if self.num_point is not None: + indices = self.points_sampler(points_xyz, features) + new_xyz = gather_points(xyz_flipped, + indices).transpose(1, 2).contiguous() + else: + new_xyz = None + + return new_xyz, indices + + def _pool_features(self, features): + """Perform feature aggregation using pooling operation. + + Args: + features (torch.Tensor): (B, C, N, K) + Features of locally grouped points before pooling. + + Returns: + torch.Tensor: (B, C, N) + Pooled features aggregating local information. + """ + if self.pool_mod == 'max': + # (B, C, N, 1) + new_features = F.max_pool2d( + features, kernel_size=[1, features.size(3)]) + elif self.pool_mod == 'avg': + # (B, C, N, 1) + new_features = F.avg_pool2d( + features, kernel_size=[1, features.size(3)]) + else: + raise NotImplementedError + + return new_features.squeeze(-1).contiguous() + + def forward( + self, + points_xyz, + features=None, + indices=None, + target_xyz=None, + ): + """forward. + + Args: + points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. + features (Tensor, optional): (B, C, N) features of each point. + Default: None. + indices (Tensor, optional): (B, num_point) Index of the features. + Default: None. + target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs. + Default: None. + + Returns: + Tensor: (B, M, 3) where M is the number of points. + New features xyz. + Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number + of points. New feature descriptors. + Tensor: (B, M) where M is the number of points. + Index of the features. + """ + new_features_list = [] + + # sample points, (B, num_point, 3), (B, num_point) + new_xyz, indices = self._sample_points(points_xyz, features, indices, + target_xyz) + + for i in range(len(self.groupers)): + # grouped_results may contain: + # - grouped_features: (B, C, num_point, nsample) + # - grouped_xyz: (B, 3, num_point, nsample) + # - grouped_idx: (B, num_point, nsample) + grouped_results = self.groupers[i](points_xyz, new_xyz, features) + + # (B, mlp[-1], num_point, nsample) + new_features = self.mlps[i](grouped_results) + + # this is a bit hack because PAConv outputs two values + # we take the first one as feature + if isinstance(self.mlps[i][0], PAConv): + assert isinstance(new_features, tuple) + new_features = new_features[0] + + # (B, mlp[-1], num_point) + new_features = self._pool_features(new_features) + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1), indices + + +@SA_MODULES.register_module() +class PointSAModuleMSG(BasePointSAModule): + """Point set abstraction module with multi-scale grouping (MSG) used in + PointNets. + + Args: + num_point (int): Number of points. + radii (list[float]): List of radius in each ball query. + sample_nums (list[int]): Number of samples in each ball query. + mlp_channels (list[list[int]]): Specify of the pointnet before + the global pooling for each scale. + fps_mod (list[str], optional): Type of FPS method, valid mod + ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. + F-FPS: using feature distances for FPS. + D-FPS: using Euclidean distances of points for FPS. + FS: using F-FPS and D-FPS simultaneously. 
+ fps_sample_range_list (list[int], optional): Range of points to + apply FPS. Default: [-1]. + dilated_group (bool, optional): Whether to use dilated ball query. + Default: False. + norm_cfg (dict, optional): Type of normalization method. + Default: dict(type='BN2d'). + use_xyz (bool, optional): Whether to use xyz. + Default: True. + pool_mod (str, optional): Type of pooling method. + Default: 'max_pool'. + normalize_xyz (bool, optional): Whether to normalize local XYZ + with radius. Default: False. + bias (bool | str, optional): If specified as `auto`, it will be + decided by `norm_cfg`. `bias` will be set as True if + `norm_cfg` is None, otherwise False. Default: 'auto'. + """ + + def __init__(self, + num_point, + radii, + sample_nums, + mlp_channels, + fps_mod=['D-FPS'], + fps_sample_range_list=[-1], + dilated_group=False, + norm_cfg=dict(type='BN2d'), + use_xyz=True, + pool_mod='max', + normalize_xyz=False, + bias='auto'): + super(PointSAModuleMSG, self).__init__( + num_point=num_point, + radii=radii, + sample_nums=sample_nums, + mlp_channels=mlp_channels, + fps_mod=fps_mod, + fps_sample_range_list=fps_sample_range_list, + dilated_group=dilated_group, + use_xyz=use_xyz, + pool_mod=pool_mod, + normalize_xyz=normalize_xyz) + + for i in range(len(self.mlp_channels)): + mlp_channel = self.mlp_channels[i] + if use_xyz: + mlp_channel[0] += 3 + + mlp = nn.Sequential() + for i in range(len(mlp_channel) - 1): + mlp.add_module( + f'layer{i}', + ConvModule( + mlp_channel[i], + mlp_channel[i + 1], + kernel_size=(1, 1), + stride=(1, 1), + conv_cfg=dict(type='Conv2d'), + norm_cfg=norm_cfg, + bias=bias)) + self.mlps.append(mlp) + + +@SA_MODULES.register_module() +class PointSAModule(PointSAModuleMSG): + """Point set abstraction module with single-scale grouping (SSG) used in + PointNets. + + Args: + mlp_channels (list[int]): Specify of the pointnet before + the global pooling for each scale. + num_point (int, optional): Number of points. + Default: None. + radius (float, optional): Radius to group with. + Default: None. + num_sample (int, optional): Number of samples in each ball query. + Default: None. + norm_cfg (dict, optional): Type of normalization method. + Default: dict(type='BN2d'). + use_xyz (bool, optional): Whether to use xyz. + Default: True. + pool_mod (str, optional): Type of pooling method. + Default: 'max_pool'. + fps_mod (list[str], optional): Type of FPS method, valid mod + ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. + fps_sample_range_list (list[int], optional): Range of points + to apply FPS. Default: [-1]. + normalize_xyz (bool, optional): Whether to normalize local XYZ + with radius. Default: False. + """ + + def __init__(self, + mlp_channels, + num_point=None, + radius=None, + num_sample=None, + norm_cfg=dict(type='BN2d'), + use_xyz=True, + pool_mod='max', + fps_mod=['D-FPS'], + fps_sample_range_list=[-1], + normalize_xyz=False): + super(PointSAModule, self).__init__( + mlp_channels=[mlp_channels], + num_point=num_point, + radii=[radius], + sample_nums=[num_sample], + norm_cfg=norm_cfg, + use_xyz=use_xyz, + pool_mod=pool_mod, + fps_mod=fps_mod, + fps_sample_range_list=fps_sample_range_list, + normalize_xyz=normalize_xyz) diff --git a/mmdet3d/ops/sparse_block.py b/mmdet3d/ops/sparse_block.py index 03b18e2..fc40740 100644 --- a/mmdet3d/ops/sparse_block.py +++ b/mmdet3d/ops/sparse_block.py @@ -1,199 +1,199 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
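# A minimal construction sketch for the PointSAModule defined above; the
# sizes are illustrative assumptions and the grouping ops expect CUDA tensors.
import torch
from mmdet3d.ops.pointnet_modules.point_sa_module import PointSAModule

sa = PointSAModule(
    mlp_channels=[4, 64, 128],   # 4 feature channels; +3 added internally for xyz
    num_point=512,
    radius=0.4,
    num_sample=32).cuda()

xyz = torch.rand(2, 2048, 3).cuda()      # (B, N, 3)
feats = torch.rand(2, 4, 2048).cuda()    # (B, C, N)
new_xyz, new_feats, idx = sa(xyz, feats)
# new_xyz: (2, 512, 3), new_feats: (2, 128, 512), idx: (2, 512)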
-from mmcv.cnn import build_conv_layer, build_norm_layer -from torch import nn - -from mmdet.models.backbones.resnet import BasicBlock, Bottleneck -from .spconv import IS_SPCONV2_AVAILABLE - -if IS_SPCONV2_AVAILABLE: - from spconv.pytorch import SparseModule, SparseSequential -else: - from mmcv.ops import SparseModule, SparseSequential - - -def replace_feature(out, new_features): - if 'replace_feature' in out.__dir__(): - # spconv 2.x behaviour - return out.replace_feature(new_features) - else: - out.features = new_features - return out - - -class SparseBottleneck(Bottleneck, SparseModule): - """Sparse bottleneck block for PartA^2. - - Bottleneck block implemented with submanifold sparse convolution. - - Args: - inplanes (int): inplanes of block. - planes (int): planes of block. - stride (int, optional): stride of the first block. Default: 1. - downsample (Module, optional): down sample module for block. - conv_cfg (dict, optional): dictionary to construct and config conv - layer. Default: None. - norm_cfg (dict, optional): dictionary to construct and config norm - layer. Default: dict(type='BN'). - """ - - expansion = 4 - - def __init__(self, - inplanes, - planes, - stride=1, - downsample=None, - conv_cfg=None, - norm_cfg=None): - - SparseModule.__init__(self) - Bottleneck.__init__( - self, - inplanes, - planes, - stride=stride, - downsample=downsample, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg) - - def forward(self, x): - identity = x.features - - out = self.conv1(x) - out = replace_feature(out, self.bn1(out.features)) - out = replace_feature(out, self.relu(out.features)) - - out = self.conv2(out) - out = replace_feature(out, self.bn2(out.features)) - out = replace_feature(out, self.relu(out.features)) - - out = self.conv3(out) - out = replace_feature(out, self.bn3(out.features)) - - if self.downsample is not None: - identity = self.downsample(x) - - out = replace_feature(out, out.features + identity) - out = replace_feature(out, self.relu(out.features)) - - return out - - -class SparseBasicBlock(BasicBlock, SparseModule): - """Sparse basic block for PartA^2. - - Sparse basic block implemented with submanifold sparse convolution. - - Args: - inplanes (int): inplanes of block. - planes (int): planes of block. - stride (int, optional): stride of the first block. Default: 1. - downsample (Module, optional): down sample module for block. - conv_cfg (dict, optional): dictionary to construct and config conv - layer. Default: None. - norm_cfg (dict, optional): dictionary to construct and config norm - layer. Default: dict(type='BN'). 
- """ - - expansion = 1 - - def __init__(self, - inplanes, - planes, - stride=1, - downsample=None, - conv_cfg=None, - norm_cfg=None): - SparseModule.__init__(self) - BasicBlock.__init__( - self, - inplanes, - planes, - stride=stride, - downsample=downsample, - conv_cfg=conv_cfg, - norm_cfg=norm_cfg) - - def forward(self, x): - identity = x.features - - assert x.features.dim() == 2, f'x.features.dim()={x.features.dim()}' - out = self.conv1(x) - out = replace_feature(out, self.norm1(out.features)) - out = replace_feature(out, self.relu(out.features)) - - out = self.conv2(out) - out = replace_feature(out, self.norm2(out.features)) - - if self.downsample is not None: - identity = self.downsample(x) - - out = replace_feature(out, out.features + identity) - out = replace_feature(out, self.relu(out.features)) - - return out - - -def make_sparse_convmodule(in_channels, - out_channels, - kernel_size, - indice_key, - stride=1, - padding=0, - conv_type='SubMConv3d', - norm_cfg=None, - order=('conv', 'norm', 'act')): - """Make sparse convolution module. - - Args: - in_channels (int): the number of input channels - out_channels (int): the number of out channels - kernel_size (int|tuple(int)): kernel size of convolution - indice_key (str): the indice key used for sparse tensor - stride (int|tuple(int)): the stride of convolution - padding (int or list[int]): the padding number of input - conv_type (str): sparse conv type in spconv - norm_cfg (dict[str]): config of normalization layer - order (tuple[str]): The order of conv/norm/activation layers. It is a - sequence of "conv", "norm" and "act". Common examples are - ("conv", "norm", "act") and ("act", "conv", "norm"). - - Returns: - spconv.SparseSequential: sparse convolution module. - """ - assert isinstance(order, tuple) and len(order) <= 3 - assert set(order) | {'conv', 'norm', 'act'} == {'conv', 'norm', 'act'} - - conv_cfg = dict(type=conv_type, indice_key=indice_key) - - layers = list() - for layer in order: - if layer == 'conv': - if conv_type not in [ - 'SparseInverseConv3d', 'SparseInverseConv2d', - 'SparseInverseConv1d' - ]: - layers.append( - build_conv_layer( - conv_cfg, - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - bias=False)) - else: - layers.append( - build_conv_layer( - conv_cfg, - in_channels, - out_channels, - kernel_size, - bias=False)) - elif layer == 'norm': - layers.append(build_norm_layer(norm_cfg, out_channels)[1]) - elif layer == 'act': - layers.append(nn.ReLU(inplace=True)) - - layers = SparseSequential(*layers) - return layers +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer +from torch import nn + +from mmdet.models.backbones.resnet import BasicBlock, Bottleneck +from .spconv import IS_SPCONV2_AVAILABLE + +if IS_SPCONV2_AVAILABLE: + from spconv.pytorch import SparseModule, SparseSequential +else: + from mmcv.ops import SparseModule, SparseSequential + + +def replace_feature(out, new_features): + if 'replace_feature' in out.__dir__(): + # spconv 2.x behaviour + return out.replace_feature(new_features) + else: + out.features = new_features + return out + + +class SparseBottleneck(Bottleneck, SparseModule): + """Sparse bottleneck block for PartA^2. + + Bottleneck block implemented with submanifold sparse convolution. + + Args: + inplanes (int): inplanes of block. + planes (int): planes of block. + stride (int, optional): stride of the first block. Default: 1. + downsample (Module, optional): down sample module for block. 
+ conv_cfg (dict, optional): dictionary to construct and config conv + layer. Default: None. + norm_cfg (dict, optional): dictionary to construct and config norm + layer. Default: dict(type='BN'). + """ + + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + conv_cfg=None, + norm_cfg=None): + + SparseModule.__init__(self) + Bottleneck.__init__( + self, + inplanes, + planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + def forward(self, x): + identity = x.features + + out = self.conv1(x) + out = replace_feature(out, self.bn1(out.features)) + out = replace_feature(out, self.relu(out.features)) + + out = self.conv2(out) + out = replace_feature(out, self.bn2(out.features)) + out = replace_feature(out, self.relu(out.features)) + + out = self.conv3(out) + out = replace_feature(out, self.bn3(out.features)) + + if self.downsample is not None: + identity = self.downsample(x) + + out = replace_feature(out, out.features + identity) + out = replace_feature(out, self.relu(out.features)) + + return out + + +class SparseBasicBlock(BasicBlock, SparseModule): + """Sparse basic block for PartA^2. + + Sparse basic block implemented with submanifold sparse convolution. + + Args: + inplanes (int): inplanes of block. + planes (int): planes of block. + stride (int, optional): stride of the first block. Default: 1. + downsample (Module, optional): down sample module for block. + conv_cfg (dict, optional): dictionary to construct and config conv + layer. Default: None. + norm_cfg (dict, optional): dictionary to construct and config norm + layer. Default: dict(type='BN'). + """ + + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + conv_cfg=None, + norm_cfg=None): + SparseModule.__init__(self) + BasicBlock.__init__( + self, + inplanes, + planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg) + + def forward(self, x): + identity = x.features + + assert x.features.dim() == 2, f'x.features.dim()={x.features.dim()}' + out = self.conv1(x) + out = replace_feature(out, self.norm1(out.features)) + out = replace_feature(out, self.relu(out.features)) + + out = self.conv2(out) + out = replace_feature(out, self.norm2(out.features)) + + if self.downsample is not None: + identity = self.downsample(x) + + out = replace_feature(out, out.features + identity) + out = replace_feature(out, self.relu(out.features)) + + return out + + +def make_sparse_convmodule(in_channels, + out_channels, + kernel_size, + indice_key, + stride=1, + padding=0, + conv_type='SubMConv3d', + norm_cfg=None, + order=('conv', 'norm', 'act')): + """Make sparse convolution module. + + Args: + in_channels (int): the number of input channels + out_channels (int): the number of out channels + kernel_size (int|tuple(int)): kernel size of convolution + indice_key (str): the indice key used for sparse tensor + stride (int|tuple(int)): the stride of convolution + padding (int or list[int]): the padding number of input + conv_type (str): sparse conv type in spconv + norm_cfg (dict[str]): config of normalization layer + order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Common examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). + + Returns: + spconv.SparseSequential: sparse convolution module. 
+ """ + assert isinstance(order, tuple) and len(order) <= 3 + assert set(order) | {'conv', 'norm', 'act'} == {'conv', 'norm', 'act'} + + conv_cfg = dict(type=conv_type, indice_key=indice_key) + + layers = list() + for layer in order: + if layer == 'conv': + if conv_type not in [ + 'SparseInverseConv3d', 'SparseInverseConv2d', + 'SparseInverseConv1d' + ]: + layers.append( + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + bias=False)) + else: + layers.append( + build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size, + bias=False)) + elif layer == 'norm': + layers.append(build_norm_layer(norm_cfg, out_channels)[1]) + elif layer == 'act': + layers.append(nn.ReLU(inplace=True)) + + layers = SparseSequential(*layers) + return layers diff --git a/mmdet3d/ops/spconv/__init__.py b/mmdet3d/ops/spconv/__init__.py index 561e502..5a8e789 100644 --- a/mmdet3d/ops/spconv/__init__.py +++ b/mmdet3d/ops/spconv/__init__.py @@ -1,14 +1,14 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .overwrite_spconv.write_spconv2 import register_spconv2 - -try: - import spconv -except ImportError: - IS_SPCONV2_AVAILABLE = False -else: - if hasattr(spconv, '__version__') and spconv.__version__ >= '2.0.0': - IS_SPCONV2_AVAILABLE = register_spconv2() - else: - IS_SPCONV2_AVAILABLE = False - -__all__ = ['IS_SPCONV2_AVAILABLE'] +# Copyright (c) OpenMMLab. All rights reserved. +from .overwrite_spconv.write_spconv2 import register_spconv2 + +try: + import spconv +except ImportError: + IS_SPCONV2_AVAILABLE = False +else: + if hasattr(spconv, '__version__') and spconv.__version__ >= '2.0.0': + IS_SPCONV2_AVAILABLE = register_spconv2() + else: + IS_SPCONV2_AVAILABLE = False + +__all__ = ['IS_SPCONV2_AVAILABLE'] diff --git a/mmdet3d/ops/spconv/overwrite_spconv/__init__.py b/mmdet3d/ops/spconv/overwrite_spconv/__init__.py index 2e93d9c..a0eabe4 100644 --- a/mmdet3d/ops/spconv/overwrite_spconv/__init__.py +++ b/mmdet3d/ops/spconv/overwrite_spconv/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .write_spconv2 import register_spconv2 - -__all__ = ['register_spconv2'] +# Copyright (c) OpenMMLab. All rights reserved. +from .write_spconv2 import register_spconv2 + +__all__ = ['register_spconv2'] diff --git a/mmdet3d/ops/spconv/overwrite_spconv/write_spconv2.py b/mmdet3d/ops/spconv/overwrite_spconv/write_spconv2.py index 237051e..37e98d2 100644 --- a/mmdet3d/ops/spconv/overwrite_spconv/write_spconv2.py +++ b/mmdet3d/ops/spconv/overwrite_spconv/write_spconv2.py @@ -1,118 +1,118 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
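# A usage sketch for make_sparse_convmodule() above; the channel counts and
# norm settings are illustrative assumptions (sparse features are stored as
# an (N, C) matrix, so a 1d norm such as BN1d is the usual choice).
from mmdet3d.ops.sparse_block import make_sparse_convmodule

block = make_sparse_convmodule(
    in_channels=16,
    out_channels=32,
    kernel_size=3,
    indice_key='subm1',
    padding=1,
    conv_type='SubMConv3d',
    norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
    order=('conv', 'norm', 'act'))
# block is a SparseSequential: SubMConv3d(16 -> 32) -> BN1d(32) -> ReLU,
# applied to a spconv SparseConvTensor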
-import itertools - -from mmcv.cnn.bricks.registry import CONV_LAYERS -from torch.nn.parameter import Parameter - - -def register_spconv2(): - """This func registers spconv2.0 spconv ops to overwrite the default mmcv - spconv ops.""" - try: - from spconv.pytorch import (SparseConv2d, SparseConv3d, SparseConv4d, - SparseConvTranspose2d, - SparseConvTranspose3d, SparseInverseConv2d, - SparseInverseConv3d, SparseModule, - SubMConv2d, SubMConv3d, SubMConv4d) - except ImportError: - return False - else: - CONV_LAYERS._register_module(SparseConv2d, 'SparseConv2d', force=True) - CONV_LAYERS._register_module(SparseConv3d, 'SparseConv3d', force=True) - CONV_LAYERS._register_module(SparseConv4d, 'SparseConv4d', force=True) - - CONV_LAYERS._register_module( - SparseConvTranspose2d, 'SparseConvTranspose2d', force=True) - CONV_LAYERS._register_module( - SparseConvTranspose3d, 'SparseConvTranspose3d', force=True) - - CONV_LAYERS._register_module( - SparseInverseConv2d, 'SparseInverseConv2d', force=True) - CONV_LAYERS._register_module( - SparseInverseConv3d, 'SparseInverseConv3d', force=True) - - CONV_LAYERS._register_module(SubMConv2d, 'SubMConv2d', force=True) - CONV_LAYERS._register_module(SubMConv3d, 'SubMConv3d', force=True) - CONV_LAYERS._register_module(SubMConv4d, 'SubMConv4d', force=True) - SparseModule._load_from_state_dict = _load_from_state_dict - SparseModule._save_to_state_dict = _save_to_state_dict - return True - - -def _save_to_state_dict(self, destination, prefix, keep_vars): - """Rewrite this func to compat the convolutional kernel weights between - spconv 1.x in MMCV and 2.x in spconv2.x. - - Kernel weights in MMCV spconv has shape in (D,H,W,in_channel,out_channel) , - while those in spcon2.x is in (out_channel,D,H,W,in_channel). - """ - for name, param in self._parameters.items(): - if param is not None: - param = param if keep_vars else param.detach() - if name == 'weight': - dims = list(range(1, len(param.shape))) + [0] - param = param.permute(*dims) - destination[prefix + name] = param - for name, buf in self._buffers.items(): - if buf is not None and name not in self._non_persistent_buffers_set: - destination[prefix + name] = buf if keep_vars else buf.detach() - - -def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - """Rewrite this func to compat the convolutional kernel weights between - spconv 1.x in MMCV and 2.x in spconv2.x. - - Kernel weights in MMCV spconv has shape in (D,H,W,in_channel,out_channel) , - while those in spcon2.x is in (out_channel,D,H,W,in_channel). 
- """ - for hook in self._load_state_dict_pre_hooks.values(): - hook(state_dict, prefix, local_metadata, strict, missing_keys, - unexpected_keys, error_msgs) - - local_name_params = itertools.chain(self._parameters.items(), - self._buffers.items()) - local_state = {k: v.data for k, v in local_name_params if v is not None} - - for name, param in local_state.items(): - key = prefix + name - if key in state_dict: - input_param = state_dict[key] - - # Backward compatibility: loading 1-dim tensor from - # 0.3.* to version 0.4+ - if len(param.shape) == 0 and len(input_param.shape) == 1: - input_param = input_param[0] - dims = [len(input_param.shape) - 1] + list( - range(len(input_param.shape) - 1)) - input_param = input_param.permute(*dims) - if input_param.shape != param.shape: - # local shape should match the one in checkpoint - error_msgs.append( - f'size mismatch for {key}: copying a param with ' - f'shape {key, input_param.shape} from checkpoint,' - f'the shape in current model is {param.shape}.') - continue - - if isinstance(input_param, Parameter): - # backwards compatibility for serialized parameters - input_param = input_param.data - try: - param.copy_(input_param) - except Exception: - error_msgs.append( - f'While copying the parameter named "{key}", whose ' - f'dimensions in the model are {param.size()} and whose ' - f'dimensions in the checkpoint are {input_param.size()}.') - elif strict: - missing_keys.append(key) - - if strict: - for key, input_param in state_dict.items(): - if key.startswith(prefix): - input_name = key[len(prefix):] - input_name = input_name.split( - '.', 1)[0] # get the name of param/buffer/child - if input_name not in self._modules \ - and input_name not in local_state: - unexpected_keys.append(key) +# Copyright (c) OpenMMLab. All rights reserved. +import itertools + +from mmcv.cnn.bricks.registry import CONV_LAYERS +from torch.nn.parameter import Parameter + + +def register_spconv2(): + """This func registers spconv2.0 spconv ops to overwrite the default mmcv + spconv ops.""" + try: + from spconv.pytorch import (SparseConv2d, SparseConv3d, SparseConv4d, + SparseConvTranspose2d, + SparseConvTranspose3d, SparseInverseConv2d, + SparseInverseConv3d, SparseModule, + SubMConv2d, SubMConv3d, SubMConv4d) + except ImportError: + return False + else: + CONV_LAYERS._register_module(SparseConv2d, 'SparseConv2d', force=True) + CONV_LAYERS._register_module(SparseConv3d, 'SparseConv3d', force=True) + CONV_LAYERS._register_module(SparseConv4d, 'SparseConv4d', force=True) + + CONV_LAYERS._register_module( + SparseConvTranspose2d, 'SparseConvTranspose2d', force=True) + CONV_LAYERS._register_module( + SparseConvTranspose3d, 'SparseConvTranspose3d', force=True) + + CONV_LAYERS._register_module( + SparseInverseConv2d, 'SparseInverseConv2d', force=True) + CONV_LAYERS._register_module( + SparseInverseConv3d, 'SparseInverseConv3d', force=True) + + CONV_LAYERS._register_module(SubMConv2d, 'SubMConv2d', force=True) + CONV_LAYERS._register_module(SubMConv3d, 'SubMConv3d', force=True) + CONV_LAYERS._register_module(SubMConv4d, 'SubMConv4d', force=True) + SparseModule._load_from_state_dict = _load_from_state_dict + SparseModule._save_to_state_dict = _save_to_state_dict + return True + + +def _save_to_state_dict(self, destination, prefix, keep_vars): + """Rewrite this func to compat the convolutional kernel weights between + spconv 1.x in MMCV and 2.x in spconv2.x. 
+ + Kernel weights in MMCV spconv has shape in (D,H,W,in_channel,out_channel) , + while those in spcon2.x is in (out_channel,D,H,W,in_channel). + """ + for name, param in self._parameters.items(): + if param is not None: + param = param if keep_vars else param.detach() + if name == 'weight': + dims = list(range(1, len(param.shape))) + [0] + param = param.permute(*dims) + destination[prefix + name] = param + for name, buf in self._buffers.items(): + if buf is not None and name not in self._non_persistent_buffers_set: + destination[prefix + name] = buf if keep_vars else buf.detach() + + +def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """Rewrite this func to compat the convolutional kernel weights between + spconv 1.x in MMCV and 2.x in spconv2.x. + + Kernel weights in MMCV spconv has shape in (D,H,W,in_channel,out_channel) , + while those in spcon2.x is in (out_channel,D,H,W,in_channel). + """ + for hook in self._load_state_dict_pre_hooks.values(): + hook(state_dict, prefix, local_metadata, strict, missing_keys, + unexpected_keys, error_msgs) + + local_name_params = itertools.chain(self._parameters.items(), + self._buffers.items()) + local_state = {k: v.data for k, v in local_name_params if v is not None} + + for name, param in local_state.items(): + key = prefix + name + if key in state_dict: + input_param = state_dict[key] + + # Backward compatibility: loading 1-dim tensor from + # 0.3.* to version 0.4+ + if len(param.shape) == 0 and len(input_param.shape) == 1: + input_param = input_param[0] + dims = [len(input_param.shape) - 1] + list( + range(len(input_param.shape) - 1)) + input_param = input_param.permute(*dims) + if input_param.shape != param.shape: + # local shape should match the one in checkpoint + error_msgs.append( + f'size mismatch for {key}: copying a param with ' + f'shape {key, input_param.shape} from checkpoint,' + f'the shape in current model is {param.shape}.') + continue + + if isinstance(input_param, Parameter): + # backwards compatibility for serialized parameters + input_param = input_param.data + try: + param.copy_(input_param) + except Exception: + error_msgs.append( + f'While copying the parameter named "{key}", whose ' + f'dimensions in the model are {param.size()} and whose ' + f'dimensions in the checkpoint are {input_param.size()}.') + elif strict: + missing_keys.append(key) + + if strict: + for key, input_param in state_dict.items(): + if key.startswith(prefix): + input_name = key[len(prefix):] + input_name = input_name.split( + '.', 1)[0] # get the name of param/buffer/child + if input_name not in self._modules \ + and input_name not in local_state: + unexpected_keys.append(key) diff --git a/mmdet3d/utils/__init__.py b/mmdet3d/utils/__init__.py index ad59961..cc4ab19 100644 --- a/mmdet3d/utils/__init__.py +++ b/mmdet3d/utils/__init__.py @@ -1,14 +1,14 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.utils import Registry, build_from_cfg, print_log - -from .collect_env import collect_env -from .compat_cfg import compat_cfg -from .logger import get_root_logger -from .misc import find_latest_checkpoint -from .setup_env import setup_multi_processes - -__all__ = [ - 'Registry', 'build_from_cfg', 'get_root_logger', 'collect_env', - 'print_log', 'setup_multi_processes', 'find_latest_checkpoint', - 'compat_cfg' -] +# Copyright (c) OpenMMLab. All rights reserved. 
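# A worked example of the kernel-layout conversion handled by the two hooks
# above: spconv 2.x keeps weights as (out_c, D, H, W, in_c), while MMCV
# spconv 1.x checkpoints store (D, H, W, in_c, out_c). Sizes are arbitrary.
import torch

w_spconv2 = torch.zeros(64, 3, 3, 3, 16)     # (out_c, D, H, W, in_c)

# _save_to_state_dict: move dim 0 to the end before writing the checkpoint
w_ckpt = w_spconv2.permute(1, 2, 3, 4, 0)    # (D, H, W, in_c, out_c)

# _load_from_state_dict: move the last dim back to the front when loading
w_back = w_ckpt.permute(4, 0, 1, 2, 3)       # (out_c, D, H, W, in_c)
assert w_back.shape == w_spconv2.shape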
+from mmcv.utils import Registry, build_from_cfg, print_log + +from .collect_env import collect_env +from .compat_cfg import compat_cfg +from .logger import get_root_logger +from .misc import find_latest_checkpoint +from .setup_env import setup_multi_processes + +__all__ = [ + 'Registry', 'build_from_cfg', 'get_root_logger', 'collect_env', + 'print_log', 'setup_multi_processes', 'find_latest_checkpoint', + 'compat_cfg' +] diff --git a/mmdet3d/utils/collect_env.py b/mmdet3d/utils/collect_env.py index 1131f12..952fc32 100644 --- a/mmdet3d/utils/collect_env.py +++ b/mmdet3d/utils/collect_env.py @@ -1,23 +1,23 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.utils import collect_env as collect_base_env -from mmcv.utils import get_git_hash - -import mmdet -import mmdet3d -import mmseg -from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE - - -def collect_env(): - """Collect the information of the running environments.""" - env_info = collect_base_env() - env_info['MMDetection'] = mmdet.__version__ - env_info['MMSegmentation'] = mmseg.__version__ - env_info['MMDetection3D'] = mmdet3d.__version__ + '+' + get_git_hash()[:7] - env_info['spconv2.0'] = IS_SPCONV2_AVAILABLE - return env_info - - -if __name__ == '__main__': - for name, val in collect_env().items(): - print(f'{name}: {val}') +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import collect_env as collect_base_env +from mmcv.utils import get_git_hash + +import mmdet +import mmdet3d +import mmseg +from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE + + +def collect_env(): + """Collect the information of the running environments.""" + env_info = collect_base_env() + env_info['MMDetection'] = mmdet.__version__ + env_info['MMSegmentation'] = mmseg.__version__ + env_info['MMDetection3D'] = mmdet3d.__version__ + '+' + get_git_hash()[:7] + env_info['spconv2.0'] = IS_SPCONV2_AVAILABLE + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmdet3d/utils/compat_cfg.py b/mmdet3d/utils/compat_cfg.py index 05aa37d..c475504 100644 --- a/mmdet3d/utils/compat_cfg.py +++ b/mmdet3d/utils/compat_cfg.py @@ -1,139 +1,139 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import warnings - -from mmcv import ConfigDict - - -def compat_cfg(cfg): - """This function would modify some filed to keep the compatibility of - config. - - For example, it will move some args which will be deprecated to the correct - fields. - """ - cfg = copy.deepcopy(cfg) - cfg = compat_imgs_per_gpu(cfg) - cfg = compat_loader_args(cfg) - cfg = compat_runner_args(cfg) - return cfg - - -def compat_runner_args(cfg): - if 'runner' not in cfg: - cfg.runner = ConfigDict({ - 'type': 'EpochBasedRunner', - 'max_epochs': cfg.total_epochs - }) - warnings.warn( - 'config is now expected to have a `runner` section, ' - 'please set `runner` in your config.', UserWarning) - else: - if 'total_epochs' in cfg: - assert cfg.total_epochs == cfg.runner.max_epochs - return cfg - - -def compat_imgs_per_gpu(cfg): - cfg = copy.deepcopy(cfg) - if 'imgs_per_gpu' in cfg.data: - warnings.warn('"imgs_per_gpu" is deprecated in MMDet V2.0. 
' - 'Please use "samples_per_gpu" instead') - if 'samples_per_gpu' in cfg.data: - warnings.warn( - f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' - f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' - f'={cfg.data.imgs_per_gpu} is used in this experiments') - else: - warnings.warn('Automatically set "samples_per_gpu"="imgs_per_gpu"=' - f'{cfg.data.imgs_per_gpu} in this experiments') - cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu - return cfg - - -def compat_loader_args(cfg): - """Deprecated sample_per_gpu in cfg.data.""" - - cfg = copy.deepcopy(cfg) - if 'train_dataloader' not in cfg.data: - cfg.data['train_dataloader'] = ConfigDict() - if 'val_dataloader' not in cfg.data: - cfg.data['val_dataloader'] = ConfigDict() - if 'test_dataloader' not in cfg.data: - cfg.data['test_dataloader'] = ConfigDict() - - # special process for train_dataloader - if 'samples_per_gpu' in cfg.data: - - samples_per_gpu = cfg.data.pop('samples_per_gpu') - assert 'samples_per_gpu' not in \ - cfg.data.train_dataloader, ('`samples_per_gpu` are set ' - 'in `data` field and ` ' - 'data.train_dataloader` ' - 'at the same time. ' - 'Please only set it in ' - '`data.train_dataloader`. ') - cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu - - if 'persistent_workers' in cfg.data: - - persistent_workers = cfg.data.pop('persistent_workers') - assert 'persistent_workers' not in \ - cfg.data.train_dataloader, ('`persistent_workers` are set ' - 'in `data` field and ` ' - 'data.train_dataloader` ' - 'at the same time. ' - 'Please only set it in ' - '`data.train_dataloader`. ') - cfg.data.train_dataloader['persistent_workers'] = persistent_workers - - if 'workers_per_gpu' in cfg.data: - - workers_per_gpu = cfg.data.pop('workers_per_gpu') - cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu - cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu - cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu - - # special process for val_dataloader - if 'samples_per_gpu' in cfg.data.val: - # keep default value of `sample_per_gpu` is 1 - assert 'samples_per_gpu' not in \ - cfg.data.val_dataloader, ('`samples_per_gpu` are set ' - 'in `data.val` field and ` ' - 'data.val_dataloader` at ' - 'the same time. ' - 'Please only set it in ' - '`data.val_dataloader`. ') - cfg.data.val_dataloader['samples_per_gpu'] = \ - cfg.data.val.pop('samples_per_gpu') - # special process for val_dataloader - - # in case the test dataset is concatenated - if isinstance(cfg.data.test, dict): - if 'samples_per_gpu' in cfg.data.test: - assert 'samples_per_gpu' not in \ - cfg.data.test_dataloader, ('`samples_per_gpu` are set ' - 'in `data.test` field and ` ' - 'data.test_dataloader` ' - 'at the same time. ' - 'Please only set it in ' - '`data.test_dataloader`. ') - - cfg.data.test_dataloader['samples_per_gpu'] = \ - cfg.data.test.pop('samples_per_gpu') - - elif isinstance(cfg.data.test, list): - for ds_cfg in cfg.data.test: - if 'samples_per_gpu' in ds_cfg: - assert 'samples_per_gpu' not in \ - cfg.data.test_dataloader, ('`samples_per_gpu` are set ' - 'in `data.test` field and ` ' - 'data.test_dataloader` at' - ' the same time. ' - 'Please only set it in ' - '`data.test_dataloader`. ') - samples_per_gpu = max( - [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) - cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu - - return cfg +# Copyright (c) OpenMMLab. All rights reserved. 
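As a quick sanity check of the 'spconv2.0' entry that collect_env now reports (see the collect_env.py hunk above), the flag can be read straight from the environment dict; the printed values are of course machine-specific:

```python
from mmdet3d.utils import collect_env

env = collect_env()

# Reflects IS_SPCONV2_AVAILABLE, i.e. whether the spconv 2.x ops were registered.
print('spconv2.0 available:', env['spconv2.0'])
print('MMDetection3D version:', env['MMDetection3D'])   # e.g. '1.0.0rc3+<githash>'
```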
+import copy +import warnings + +from mmcv import ConfigDict + + +def compat_cfg(cfg): + """This function would modify some filed to keep the compatibility of + config. + + For example, it will move some args which will be deprecated to the correct + fields. + """ + cfg = copy.deepcopy(cfg) + cfg = compat_imgs_per_gpu(cfg) + cfg = compat_loader_args(cfg) + cfg = compat_runner_args(cfg) + return cfg + + +def compat_runner_args(cfg): + if 'runner' not in cfg: + cfg.runner = ConfigDict({ + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + }) + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + return cfg + + +def compat_imgs_per_gpu(cfg): + cfg = copy.deepcopy(cfg) + if 'imgs_per_gpu' in cfg.data: + warnings.warn('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + warnings.warn( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + warnings.warn('Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + return cfg + + +def compat_loader_args(cfg): + """Deprecated sample_per_gpu in cfg.data.""" + + cfg = copy.deepcopy(cfg) + if 'train_dataloader' not in cfg.data: + cfg.data['train_dataloader'] = ConfigDict() + if 'val_dataloader' not in cfg.data: + cfg.data['val_dataloader'] = ConfigDict() + if 'test_dataloader' not in cfg.data: + cfg.data['test_dataloader'] = ConfigDict() + + # special process for train_dataloader + if 'samples_per_gpu' in cfg.data: + + samples_per_gpu = cfg.data.pop('samples_per_gpu') + assert 'samples_per_gpu' not in \ + cfg.data.train_dataloader, ('`samples_per_gpu` are set ' + 'in `data` field and ` ' + 'data.train_dataloader` ' + 'at the same time. ' + 'Please only set it in ' + '`data.train_dataloader`. ') + cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu + + if 'persistent_workers' in cfg.data: + + persistent_workers = cfg.data.pop('persistent_workers') + assert 'persistent_workers' not in \ + cfg.data.train_dataloader, ('`persistent_workers` are set ' + 'in `data` field and ` ' + 'data.train_dataloader` ' + 'at the same time. ' + 'Please only set it in ' + '`data.train_dataloader`. ') + cfg.data.train_dataloader['persistent_workers'] = persistent_workers + + if 'workers_per_gpu' in cfg.data: + + workers_per_gpu = cfg.data.pop('workers_per_gpu') + cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu + cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu + cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu + + # special process for val_dataloader + if 'samples_per_gpu' in cfg.data.val: + # keep default value of `sample_per_gpu` is 1 + assert 'samples_per_gpu' not in \ + cfg.data.val_dataloader, ('`samples_per_gpu` are set ' + 'in `data.val` field and ` ' + 'data.val_dataloader` at ' + 'the same time. ' + 'Please only set it in ' + '`data.val_dataloader`. 
') + cfg.data.val_dataloader['samples_per_gpu'] = \ + cfg.data.val.pop('samples_per_gpu') + # special process for val_dataloader + + # in case the test dataset is concatenated + if isinstance(cfg.data.test, dict): + if 'samples_per_gpu' in cfg.data.test: + assert 'samples_per_gpu' not in \ + cfg.data.test_dataloader, ('`samples_per_gpu` are set ' + 'in `data.test` field and ` ' + 'data.test_dataloader` ' + 'at the same time. ' + 'Please only set it in ' + '`data.test_dataloader`. ') + + cfg.data.test_dataloader['samples_per_gpu'] = \ + cfg.data.test.pop('samples_per_gpu') + + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + if 'samples_per_gpu' in ds_cfg: + assert 'samples_per_gpu' not in \ + cfg.data.test_dataloader, ('`samples_per_gpu` are set ' + 'in `data.test` field and ` ' + 'data.test_dataloader` at' + ' the same time. ' + 'Please only set it in ' + '`data.test_dataloader`. ') + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu + + return cfg diff --git a/mmdet3d/utils/logger.py b/mmdet3d/utils/logger.py index 14295d1..ac94d56 100644 --- a/mmdet3d/utils/logger.py +++ b/mmdet3d/utils/logger.py @@ -1,31 +1,31 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import logging - -from mmcv.utils import get_logger - - -def get_root_logger(log_file=None, log_level=logging.INFO, name='mmdet3d'): - """Get root logger and add a keyword filter to it. - - The logger will be initialized if it has not been initialized. By default a - StreamHandler will be added. If `log_file` is specified, a FileHandler will - also be added. The name of the root logger is the top-level package name, - e.g., "mmdet3d". - - Args: - log_file (str, optional): File path of log. Defaults to None. - log_level (int, optional): The level of logger. - Defaults to logging.INFO. - name (str, optional): The name of the root logger, also used as a - filter keyword. Defaults to 'mmdet3d'. - - Returns: - :obj:`logging.Logger`: The obtained logger - """ - logger = get_logger(name=name, log_file=log_file, log_level=log_level) - - # add a logging filter - logging_filter = logging.Filter(name) - logging_filter.filter = lambda record: record.find(name) != -1 - - return logger +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +from mmcv.utils import get_logger + + +def get_root_logger(log_file=None, log_level=logging.INFO, name='mmdet3d'): + """Get root logger and add a keyword filter to it. + + The logger will be initialized if it has not been initialized. By default a + StreamHandler will be added. If `log_file` is specified, a FileHandler will + also be added. The name of the root logger is the top-level package name, + e.g., "mmdet3d". + + Args: + log_file (str, optional): File path of log. Defaults to None. + log_level (int, optional): The level of logger. + Defaults to logging.INFO. + name (str, optional): The name of the root logger, also used as a + filter keyword. Defaults to 'mmdet3d'. + + Returns: + :obj:`logging.Logger`: The obtained logger + """ + logger = get_logger(name=name, log_file=log_file, log_level=log_level) + + # add a logging filter + logging_filter = logging.Filter(name) + logging_filter.filter = lambda record: record.find(name) != -1 + + return logger diff --git a/mmdet3d/utils/misc.py b/mmdet3d/utils/misc.py index 08af048..880f6c1 100644 --- a/mmdet3d/utils/misc.py +++ b/mmdet3d/utils/misc.py @@ -1,39 +1,39 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
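The compat_loader_args shim shown a little further up moves loader options out of the top-level cfg.data and into the per-split *_dataloader dicts. A small sketch of what that migration looks like on a toy config (dataset types and numbers are made up):

```python
from mmcv import ConfigDict

from mmdet3d.utils import compat_cfg

cfg = ConfigDict(
    data=dict(
        samples_per_gpu=4,                       # old-style, top-level setting
        workers_per_gpu=2,                       # old-style, top-level setting
        train=dict(type='KittiDataset'),
        val=dict(type='KittiDataset', samples_per_gpu=1),
        test=dict(type='KittiDataset')),
    runner=dict(type='EpochBasedRunner', max_epochs=80))

cfg = compat_cfg(cfg)

assert cfg.data.train_dataloader.samples_per_gpu == 4
assert cfg.data.train_dataloader.workers_per_gpu == 2
assert cfg.data.val_dataloader.samples_per_gpu == 1
assert 'samples_per_gpu' not in cfg.data         # popped from the deprecated location
```

Configs that already use the *_dataloader fields pass through unchanged, which is the point of keeping this shim around.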
-import glob -import os.path as osp -import warnings - - -def find_latest_checkpoint(path, suffix='pth'): - """Find the latest checkpoint from the working directory. This function is - copied from mmdetection. - - Args: - path(str): The path to find checkpoints. - suffix(str): File extension. - Defaults to pth. - - Returns: - latest_path(str | None): File path of the latest checkpoint. - References: - .. [1] https://github.com/microsoft/SoftTeacher - /blob/main/ssod/utils/patch.py - """ - if not osp.exists(path): - warnings.warn('The path of checkpoints does not exist.') - return None - if osp.exists(osp.join(path, f'latest.{suffix}')): - return osp.join(path, f'latest.{suffix}') - - checkpoints = glob.glob(osp.join(path, f'*.{suffix}')) - if len(checkpoints) == 0: - warnings.warn('There are no checkpoints in the path.') - return None - latest = -1 - latest_path = None - for checkpoint in checkpoints: - count = int(osp.basename(checkpoint).split('_')[-1].split('.')[0]) - if count > latest: - latest = count - latest_path = checkpoint - return latest_path +# Copyright (c) OpenMMLab. All rights reserved. +import glob +import os.path as osp +import warnings + + +def find_latest_checkpoint(path, suffix='pth'): + """Find the latest checkpoint from the working directory. This function is + copied from mmdetection. + + Args: + path(str): The path to find checkpoints. + suffix(str): File extension. + Defaults to pth. + + Returns: + latest_path(str | None): File path of the latest checkpoint. + References: + .. [1] https://github.com/microsoft/SoftTeacher + /blob/main/ssod/utils/patch.py + """ + if not osp.exists(path): + warnings.warn('The path of checkpoints does not exist.') + return None + if osp.exists(osp.join(path, f'latest.{suffix}')): + return osp.join(path, f'latest.{suffix}') + + checkpoints = glob.glob(osp.join(path, f'*.{suffix}')) + if len(checkpoints) == 0: + warnings.warn('There are no checkpoints in the path.') + return None + latest = -1 + latest_path = None + for checkpoint in checkpoints: + count = int(osp.basename(checkpoint).split('_')[-1].split('.')[0]) + if count > latest: + latest = count + latest_path = checkpoint + return latest_path diff --git a/mmdet3d/utils/setup_env.py b/mmdet3d/utils/setup_env.py index 8812cb7..ca4193c 100644 --- a/mmdet3d/utils/setup_env.py +++ b/mmdet3d/utils/setup_env.py @@ -1,53 +1,53 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import platform -import warnings - -import cv2 -from torch import multiprocessing as mp - - -def setup_multi_processes(cfg): - """Setup multi-processing environment variables.""" - # set multi-process start method as `fork` to speed up the training - if platform.system() != 'Windows': - mp_start_method = cfg.get('mp_start_method', 'fork') - current_method = mp.get_start_method(allow_none=True) - if current_method is not None and current_method != mp_start_method: - warnings.warn( - f'Multi-processing start method `{mp_start_method}` is ' - f'different from the previous setting `{current_method}`.' - f'It will be force set to `{mp_start_method}`. 
You can change ' - f'this behavior by changing `mp_start_method` in your config.') - mp.set_start_method(mp_start_method, force=True) - - # disable opencv multithreading to avoid system being overloaded - opencv_num_threads = cfg.get('opencv_num_threads', 0) - cv2.setNumThreads(opencv_num_threads) - - # setup OMP threads - # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa - workers_per_gpu = cfg.data.get('workers_per_gpu', 1) - if 'train_dataloader' in cfg.data: - workers_per_gpu = \ - max(cfg.data.train_dataloader.get('workers_per_gpu', 1), - workers_per_gpu) - - if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1: - omp_num_threads = 1 - warnings.warn( - f'Setting OMP_NUM_THREADS environment variable for each process ' - f'to be {omp_num_threads} in default, to avoid your system being ' - f'overloaded, please further tune the variable for optimal ' - f'performance in your application as needed.') - os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) - - # setup MKL threads - if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1: - mkl_num_threads = 1 - warnings.warn( - f'Setting MKL_NUM_THREADS environment variable for each process ' - f'to be {mkl_num_threads} in default, to avoid your system being ' - f'overloaded, please further tune the variable for optimal ' - f'performance in your application as needed.') - os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) +# Copyright (c) OpenMMLab. All rights reserved. +import os +import platform +import warnings + +import cv2 +from torch import multiprocessing as mp + + +def setup_multi_processes(cfg): + """Setup multi-processing environment variables.""" + # set multi-process start method as `fork` to speed up the training + if platform.system() != 'Windows': + mp_start_method = cfg.get('mp_start_method', 'fork') + current_method = mp.get_start_method(allow_none=True) + if current_method is not None and current_method != mp_start_method: + warnings.warn( + f'Multi-processing start method `{mp_start_method}` is ' + f'different from the previous setting `{current_method}`.' + f'It will be force set to `{mp_start_method}`. 
You can change ' + f'this behavior by changing `mp_start_method` in your config.') + mp.set_start_method(mp_start_method, force=True) + + # disable opencv multithreading to avoid system being overloaded + opencv_num_threads = cfg.get('opencv_num_threads', 0) + cv2.setNumThreads(opencv_num_threads) + + # setup OMP threads + # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa + workers_per_gpu = cfg.data.get('workers_per_gpu', 1) + if 'train_dataloader' in cfg.data: + workers_per_gpu = \ + max(cfg.data.train_dataloader.get('workers_per_gpu', 1), + workers_per_gpu) + + if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1: + omp_num_threads = 1 + warnings.warn( + f'Setting OMP_NUM_THREADS environment variable for each process ' + f'to be {omp_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) + + # setup MKL threads + if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1: + mkl_num_threads = 1 + warnings.warn( + f'Setting MKL_NUM_THREADS environment variable for each process ' + f'to be {mkl_num_threads} in default, to avoid your system being ' + f'overloaded, please further tune the variable for optimal ' + f'performance in your application as needed.') + os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) diff --git a/mmdet3d/version.py b/mmdet3d/version.py index c95fbed..3b950d4 100644 --- a/mmdet3d/version.py +++ b/mmdet3d/version.py @@ -1,19 +1,19 @@ -# Copyright (c) Open-MMLab. All rights reserved. - -__version__ = '1.0.0rc3' -short_version = __version__ - - -def parse_version_info(version_str): - version_info = [] - for x in version_str.split('.'): - if x.isdigit(): - version_info.append(int(x)) - elif x.find('rc') != -1: - patch_version = x.split('rc') - version_info.append(int(patch_version[0])) - version_info.append(f'rc{patch_version[1]}') - return tuple(version_info) - - -version_info = parse_version_info(__version__) +# Copyright (c) Open-MMLab. All rights reserved. 
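setup_multi_processes, restored just above, only reads a few optional keys from the config and then mutates process-global state (start method, OpenCV threads, OMP/MKL env vars). A minimal invocation, with the values it would fall back to anyway spelled out explicitly:

```python
import os

from mmcv import Config

from mmdet3d.utils import setup_multi_processes

cfg = Config(
    dict(
        mp_start_method='fork',        # default start method on non-Windows hosts
        opencv_num_threads=0,          # disable OpenCV's own threading
        data=dict(workers_per_gpu=4)))

setup_multi_processes(cfg)

# With workers_per_gpu > 1 and nothing set beforehand, both default to '1'.
print(os.environ.get('OMP_NUM_THREADS'), os.environ.get('MKL_NUM_THREADS'))
```

Note that the OMP/MKL defaults are only applied when the variables are not already set, so explicit user settings always win.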
+ +__version__ = '1.0.0rc3' +short_version = __version__ + + +def parse_version_info(version_str): + version_info = [] + for x in version_str.split('.'): + if x.isdigit(): + version_info.append(int(x)) + elif x.find('rc') != -1: + patch_version = x.split('rc') + version_info.append(int(patch_version[0])) + version_info.append(f'rc{patch_version[1]}') + return tuple(version_info) + + +version_info = parse_version_info(__version__) diff --git a/model/model_deployor/deployor.py b/model/model_deployor/deployor.py index 02ad884..3b90380 100644 --- a/model/model_deployor/deployor.py +++ b/model/model_deployor/deployor.py @@ -1,44 +1,9 @@ import torch +from model.model_deployor.deployor_utils import trt_input_shapes import importlib if importlib.util.find_spec('tensorrt') is not None: - from model.model_deployor.onnx2tensorrt import create_trt_engine, save_trt_engine + from model.model_deployor.deployor_utils import create_trt_engine, save_trt_engine -trt_input_shapes = { - 'kitti': { - 'voxels': { - 'min_shape': [2000, 32, 4], - 'opt_shape': [5000, 32, 4], - 'max_shape': [9000, 32, 4] - }, - 'num_points': { - 'min_shape': [2000], - 'opt_shape': [5000], - 'max_shape': [9000] - }, - 'coors': { - 'min_shape': [2000, 4], - 'opt_shape': [5000, 4], - 'max_shape': [9000, 4] - } - }, - 'nuscenes': { - 'voxels': { - 'min_shape': [5000, 20, 4], - 'opt_shape': [20000, 20, 4], - 'max_shape': [30000, 20, 4] - }, - 'num_points': { - 'min_shape': [5000], - 'opt_shape': [20000], - 'max_shape': [30000] - }, - 'coors': { - 'min_shape': [5000, 4], - 'opt_shape': [20000, 4], - 'max_shape': [30000, 4] - } - } -} def deploy(model, model_inputs, diff --git a/model/model_deployor/deployor_utils.py b/model/model_deployor/deployor_utils.py index 78bfb3b..be78c94 100644 --- a/model/model_deployor/deployor_utils.py +++ b/model/model_deployor/deployor_utils.py @@ -5,6 +5,16 @@ import numpy as np from mmdet3d.datasets.pipelines import Compose from mmdet3d.core.bbox import get_box_type +from typing import Dict, Sequence, Union +import importlib +if importlib.util.find_spec('tensorrt') is not None: + import tensorrt as trt + +import onnx +import torch +from packaging import version + + test_pipelines = { 'pointpillars': { @@ -91,7 +101,6 @@ } } - voxel_layers = { 'pointpillars': { 'kitti': dict( @@ -114,6 +123,43 @@ } } +trt_input_shapes = { + 'kitti': { + 'voxels': { + 'min_shape': [2000, 32, 4], + 'opt_shape': [5000, 32, 4], + 'max_shape': [9000, 32, 4] + }, + 'num_points': { + 'min_shape': [2000], + 'opt_shape': [5000], + 'max_shape': [9000] + }, + 'coors': { + 'min_shape': [2000, 4], + 'opt_shape': [5000, 4], + 'max_shape': [9000, 4] + } + }, + 'nuscenes': { + 'voxels': { + 'min_shape': [5000, 20, 4], + 'opt_shape': [20000, 20, 4], + 'max_shape': [30000, 20, 4] + }, + 'num_points': { + 'min_shape': [5000], + 'opt_shape': [20000], + 'max_shape': [30000] + }, + 'coors': { + 'min_shape': [5000, 4], + 'opt_shape': [20000, 4], + 'max_shape': [30000, 4] + } + } +} + def create_input(pcd, dataset, model, device): """Create input for detector. 
@@ -201,4 +247,189 @@ def voxelize(voxel_layer, points): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) - return voxels, num_points, coors_batch \ No newline at end of file + return voxels, num_points, coors_batch + +def create_trt_engine(onnx_model: Union[str, onnx.ModelProto], + input_shapes: Dict[str, Sequence[int]], + log_level, + fp16_mode: bool = False, + int8_mode: bool = False, + int8_param: dict = None, + max_workspace_size: int = 0, + device_id: int = 0, + **kwargs): + """Create a tensorrt engine from ONNX. + + Args: + onnx_model (str or onnx.ModelProto): Input onnx model to convert from. + input_shapes (Dict[str, Sequence[int]]): The min/opt/max shape of + each input. + log_level (trt.Logger.Severity): The log level of TensorRT. Defaults to + `trt.Logger.INFO`. + fp16_mode (bool): Specifying whether to enable fp16 mode. + Defaults to `False`. + int8_mode (bool): Specifying whether to enable int8 mode. + Defaults to `False`. + int8_param (dict): A dict of parameter int8 mode. Defaults to `None`. + max_workspace_size (int): To set max workspace size of TensorRT engine. + some tactics and layers need large workspace. Defaults to `0`. + device_id (int): Choice the device to create engine. Defaults to `0`. + + Returns: + tensorrt.ICudaEngine: The TensorRT engine created from onnx_model. + + Example: + >>> engine = create_trt_engine( + >>> "onnx_model.onnx", + >>> {'input': {"min_shape" : [1, 3, 160, 160], + >>> "opt_shape" : [1, 3, 320, 320], + >>> "max_shape" : [1, 3, 640, 640]}}, + >>> log_level=trt.Logger.WARNING, + >>> fp16_mode=True, + >>> max_workspace_size=1 << 30, + >>> device_id=0) + >>> }) + """ + device = torch.device('cuda:{}'.format(device_id)) + # create builder and network + logger = trt.Logger(log_level) + builder = trt.Builder(logger) + EXPLICIT_BATCH = 1 << (int)( + trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + network = builder.create_network(EXPLICIT_BATCH) + + # parse onnx + parser = trt.OnnxParser(network, logger) + + if isinstance(onnx_model, str): + onnx_model = onnx.load(onnx_model) + + if not parser.parse(onnx_model.SerializeToString()): + error_msgs = '' + for error in range(parser.num_errors): + error_msgs += f'{parser.get_error(error)}\n' + raise RuntimeError(f'Failed to parse onnx, {error_msgs}') + + # config builder + if version.parse(trt.__version__) < version.parse('8'): + builder.max_workspace_size = max_workspace_size + + config = builder.create_builder_config() + config.max_workspace_size = max_workspace_size + + if onnx_model.graph.input[0].type.tensor_type.shape.dim[0].dim_value == 0: + profile = builder.create_optimization_profile() + + for input_name, param in input_shapes.items(): + min_shape = param['min_shape'] + opt_shape = param['opt_shape'] + max_shape = param['max_shape'] + profile.set_shape(input_name, min_shape, opt_shape, max_shape) + config.add_optimization_profile(profile) + + if fp16_mode: + if version.parse(trt.__version__) < version.parse('8'): + builder.fp16_mode = fp16_mode + config.set_flag(trt.BuilderFlag.FP16) + + if int8_mode: + config.set_flag(trt.BuilderFlag.INT8) + assert int8_param is not None + config.int8_calibrator = HDF5Calibrator( + int8_param['calib_file'], + input_shapes, + model_type=int8_param['model_type'], + device_id=device_id, + algorithm=int8_param.get( + 'algorithm', trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2)) + if version.parse(trt.__version__) < version.parse('8'): + builder.int8_mode = int8_mode + 
builder.int8_calibrator = config.int8_calibrator + + # create engine + with torch.cuda.device(device): + engine = builder.build_engine(network, config) + + assert engine is not None, 'Failed to create TensorRT engine' + return engine + +def save_trt_engine(engine, path: str) -> None: + """Serialize TensorRT engine to disk. + + Args: + engine (tensorrt.ICudaEngine): TensorRT engine to be serialized. + path (str): The absolute disk path to write the engine. + """ + with open(path, mode='wb') as f: + f.write(bytearray(engine.serialize())) + +def load_trt_engine(path: str): + """Deserialize TensorRT engine from disk. + + Args: + path (str): The disk path to read the engine. + + Returns: + tensorrt.ICudaEngine: The TensorRT engine loaded from disk. + """ + with trt.Logger() as logger, trt.Runtime(logger) as runtime: + with open(path, mode='rb') as f: + engine_bytes = f.read() + engine = runtime.deserialize_cuda_engine(engine_bytes) + return engine + +def torch_dtype_from_trt(dtype) -> torch.dtype: + """Convert pytorch dtype to TensorRT dtype. + + Args: + dtype (str.DataType): The data type in tensorrt. + + Returns: + torch.dtype: The corresponding data type in torch. + """ + + if dtype == trt.bool: + return torch.bool + elif dtype == trt.int8: + return torch.int8 + elif dtype == trt.int32: + return torch.int32 + elif dtype == trt.float16: + return torch.float16 + elif dtype == trt.float32: + return torch.float32 + else: + raise TypeError(f'{dtype} is not supported by torch') + +def torch_device_from_trt(device): + """Convert pytorch device to TensorRT device. + + Args: + device (trt.TensorLocation): The device in tensorrt. + Returns: + torch.device: The corresponding device in torch. + """ + if device == trt.TensorLocation.DEVICE: + return torch.device('cuda') + elif device == trt.TensorLocation.HOST: + return torch.device('cpu') + else: + return TypeError(f'{device} is not supported by torch') + +def print_trt_engine(engine): + """ + Print trt engine information. + + Args: + tensorrt.ICudaEngine: The TensorRT engine loaded from disk. + Returns: + None + """ + for idx in range(engine.num_bindings): + is_input = engine.binding_is_input(idx) + name = engine.get_binding_name(idx) + op_type = engine.get_binding_dtype(idx) + shape = engine.get_binding_shape(idx) + + print('input id:', idx, ' is input: ', is_input, ' binding name:', name, ' shape:', shape, 'type: ', + op_type) \ No newline at end of file diff --git a/model/model_deployor/onnx2tensorrt.py b/model/model_deployor/onnx2tensorrt.py deleted file mode 100644 index ef0482e..0000000 --- a/model/model_deployor/onnx2tensorrt.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Sequence, Union -import importlib -if importlib.util.find_spec('tensorrt') is not None: - import tensorrt as trt - -import onnx -import torch -from packaging import version - -def create_trt_engine(onnx_model: Union[str, onnx.ModelProto], - input_shapes: Dict[str, Sequence[int]], - log_level: trt.Logger.Severity = trt.Logger.INFO, - fp16_mode: bool = False, - int8_mode: bool = False, - int8_param: dict = None, - max_workspace_size: int = 0, - device_id: int = 0, - **kwargs) -> trt.ICudaEngine: - """Create a tensorrt engine from ONNX. - - Args: - onnx_model (str or onnx.ModelProto): Input onnx model to convert from. - input_shapes (Dict[str, Sequence[int]]): The min/opt/max shape of - each input. - log_level (trt.Logger.Severity): The log level of TensorRT. Defaults to - `trt.Logger.INFO`. 
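With create_trt_engine and the dynamic-shape table now living side by side in deployor_utils, building and serializing an engine for the KITTI PointPillars export is a short script. The file names below are placeholders, and TensorRT plus a CUDA device are assumed to be available:

```python
import tensorrt as trt

from model.model_deployor.deployor_utils import (create_trt_engine,
                                                 save_trt_engine,
                                                 trt_input_shapes)

# 'end2end.onnx' stands in for whatever the ONNX export step produced.
engine = create_trt_engine(
    'end2end.onnx',
    input_shapes=trt_input_shapes['kitti'],   # min/opt/max for voxels, num_points, coors
    log_level=trt.Logger.WARNING,
    fp16_mode=False,
    max_workspace_size=1 << 30,               # 1 GiB of builder workspace
    device_id=0)

save_trt_engine(engine, 'end2end.engine')
```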
- fp16_mode (bool): Specifying whether to enable fp16 mode. - Defaults to `False`. - int8_mode (bool): Specifying whether to enable int8 mode. - Defaults to `False`. - int8_param (dict): A dict of parameter int8 mode. Defaults to `None`. - max_workspace_size (int): To set max workspace size of TensorRT engine. - some tactics and layers need large workspace. Defaults to `0`. - device_id (int): Choice the device to create engine. Defaults to `0`. - - Returns: - tensorrt.ICudaEngine: The TensorRT engine created from onnx_model. - - Example: - >>> engine = create_trt_engine( - >>> "onnx_model.onnx", - >>> {'input': {"min_shape" : [1, 3, 160, 160], - >>> "opt_shape" : [1, 3, 320, 320], - >>> "max_shape" : [1, 3, 640, 640]}}, - >>> log_level=trt.Logger.WARNING, - >>> fp16_mode=True, - >>> max_workspace_size=1 << 30, - >>> device_id=0) - >>> }) - """ - device = torch.device('cuda:{}'.format(device_id)) - # create builder and network - logger = trt.Logger(log_level) - builder = trt.Builder(logger) - EXPLICIT_BATCH = 1 << (int)( - trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) - network = builder.create_network(EXPLICIT_BATCH) - - # parse onnx - parser = trt.OnnxParser(network, logger) - - if isinstance(onnx_model, str): - onnx_model = onnx.load(onnx_model) - - if not parser.parse(onnx_model.SerializeToString()): - error_msgs = '' - for error in range(parser.num_errors): - error_msgs += f'{parser.get_error(error)}\n' - raise RuntimeError(f'Failed to parse onnx, {error_msgs}') - - # config builder - if version.parse(trt.__version__) < version.parse('8'): - builder.max_workspace_size = max_workspace_size - - config = builder.create_builder_config() - config.max_workspace_size = max_workspace_size - - if onnx_model.graph.input[0].type.tensor_type.shape.dim[0].dim_value == 0: - profile = builder.create_optimization_profile() - - for input_name, param in input_shapes.items(): - min_shape = param['min_shape'] - opt_shape = param['opt_shape'] - max_shape = param['max_shape'] - profile.set_shape(input_name, min_shape, opt_shape, max_shape) - config.add_optimization_profile(profile) - - if fp16_mode: - if version.parse(trt.__version__) < version.parse('8'): - builder.fp16_mode = fp16_mode - config.set_flag(trt.BuilderFlag.FP16) - - if int8_mode: - config.set_flag(trt.BuilderFlag.INT8) - assert int8_param is not None - config.int8_calibrator = HDF5Calibrator( - int8_param['calib_file'], - input_shapes, - model_type=int8_param['model_type'], - device_id=device_id, - algorithm=int8_param.get( - 'algorithm', trt.CalibrationAlgoType.ENTROPY_CALIBRATION_2)) - if version.parse(trt.__version__) < version.parse('8'): - builder.int8_mode = int8_mode - builder.int8_calibrator = config.int8_calibrator - - # create engine - with torch.cuda.device(device): - engine = builder.build_engine(network, config) - - assert engine is not None, 'Failed to create TensorRT engine' - return engine - - - -def save_trt_engine(engine: trt.ICudaEngine, path: str) -> None: - """Serialize TensorRT engine to disk. - - Args: - engine (tensorrt.ICudaEngine): TensorRT engine to be serialized. - path (str): The absolute disk path to write the engine. - """ - with open(path, mode='wb') as f: - f.write(bytearray(engine.serialize())) - - -def load_trt_engine(path: str) -> trt.ICudaEngine: - """Deserialize TensorRT engine from disk. - - Args: - path (str): The disk path to read the engine. - - Returns: - tensorrt.ICudaEngine: The TensorRT engine loaded from disk. 
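The reverse direction is load_trt_engine plus print_trt_engine, which is mostly useful for checking that the binding names still match the ONNX input names ('voxels', 'num_points', 'coors'). A small sketch, again assuming TensorRT is installed and the engine file from the previous step exists:

```python
from model.model_deployor.deployor_utils import (load_trt_engine,
                                                 print_trt_engine,
                                                 torch_dtype_from_trt)

engine = load_trt_engine('end2end.engine')   # file written by save_trt_engine above
print_trt_engine(engine)                     # one line per binding: name, shape, dtype

# Map a TensorRT binding dtype to the matching torch dtype for output buffers.
print(torch_dtype_from_trt(engine.get_binding_dtype(0)))
```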
- """ - with trt.Logger() as logger, trt.Runtime(logger) as runtime: - with open(path, mode='rb') as f: - engine_bytes = f.read() - engine = runtime.deserialize_cuda_engine(engine_bytes) - return engine - -def torch_dtype_from_trt(dtype: trt.DataType) -> torch.dtype: - """Convert pytorch dtype to TensorRT dtype. - - Args: - dtype (str.DataType): The data type in tensorrt. - - Returns: - torch.dtype: The corresponding data type in torch. - """ - - if dtype == trt.bool: - return torch.bool - elif dtype == trt.int8: - return torch.int8 - elif dtype == trt.int32: - return torch.int32 - elif dtype == trt.float16: - return torch.float16 - elif dtype == trt.float32: - return torch.float32 - else: - raise TypeError(f'{dtype} is not supported by torch') - -def torch_device_from_trt(device: trt.TensorLocation): - """Convert pytorch device to TensorRT device. - - Args: - device (trt.TensorLocation): The device in tensorrt. - Returns: - torch.device: The corresponding device in torch. - """ - if device == trt.TensorLocation.DEVICE: - return torch.device('cuda') - elif device == trt.TensorLocation.HOST: - return torch.device('cpu') - else: - return TypeError(f'{device} is not supported by torch') - -def print_trt_engine(engine: trt.ICudaEngine): - """ - Print trt engine information. - - Args: - tensorrt.ICudaEngine: The TensorRT engine loaded from disk. - Returns: - None - """ - for idx in range(engine.num_bindings): - is_input = engine.binding_is_input(idx) - name = engine.get_binding_name(idx) - op_type = engine.get_binding_dtype(idx) - shape = engine.get_binding_shape(idx) - - print('input id:', idx, ' is input: ', is_input, ' binding name:', name, ' shape:', shape, 'type: ', - op_type) \ No newline at end of file diff --git a/test/test_engine/test_engine.py b/test/test_engine/test_engine.py index c3b7f48..bf84549 100644 --- a/test/test_engine/test_engine.py +++ b/test/test_engine/test_engine.py @@ -3,28 +3,26 @@ from deephub.detection_model import Pointpillars from model.model_deployor.deployor_utils import create_input -from engine.pointpillars_engine import Pointpillars_engine +from engine.engine_utils import Engine pretrain_model = 'checkpoints/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20220331_134606-d42d15ed.pth' pcd = 'test/data_tobe_tested/kitti/kitti_000008.bin' device = 'cuda:0' +data, model_inputs = create_input(pcd, 'kitti', 'pointpillars', + device) +torch_model = Pointpillars() class TestEngine(unittest.TestCase): @classmethod def setUpClass(cls): - data, model_inputs = create_input(pcd, 'kitti', 'pointpillars', - device) - cls.data = data - cls.model_inputs = model_inputs - torch_model = Pointpillars() - cls.torch_model = torch_model + pass # noinspection DuplicatedCode def test_engine_infer(self): # warp engine - model = Pointpillars_engine(self.torch_model) + model = Engine(torch_model) # load pretrain model checkpoint = torch.load(pretrain_model) model.torch_model.load_state_dict(checkpoint["state_dict"]) @@ -34,7 +32,7 @@ def test_engine_infer(self): model.cuda() model.eval() - predict = model(self.data['img_metas'][0], self.data['points'][0]) + predict = model(data['img_metas'][0], data['points'][0]) # test assert len(predict['scores_3d']) != 0 diff --git a/test/test_model_ops/test_deployor.py b/test/test_model_ops/test_deployor.py index 8fa627b..6422cf5 100644 --- a/test/test_model_ops/test_deployor.py +++ b/test/test_model_ops/test_deployor.py @@ -5,8 +5,8 @@ import onnxruntime from deephub.detection_model import Pointpillars from model.model_deployor.deployor import deploy 
-from model.model_deployor.deployor_utils import create_input_pointpillars -from model.model_deployor.onnx2tensorrt import load_trt_engine, torch_dtype_from_trt, torch_device_from_trt +from model.model_deployor.deployor_utils import create_input +from model.model_deployor.deployor_utils import load_trt_engine, torch_dtype_from_trt, torch_device_from_trt pcd = 'test/data_tobe_tested/kitti/kitti_000008.bin' device = 'cuda:0' @@ -17,34 +17,32 @@ 'coors': {0: 'voxels_num'}} # dynamic_axes = None fp16 = False - +data, model_inputs = create_input(pcd, 'kitti', 'pointpillars', device) +model = Pointpillars() +model.cuda() +model.eval() class TestModelDeployor(unittest.TestCase): @classmethod def setUpClass(cls): - data, model_inputs = create_input_pointpillars(pcd, 'kitti', device) - cls.model_inputs = model_inputs - model = Pointpillars() - model.cuda() - model.eval() - cls.model = model + pass # noinspection DuplicatedCode def test_deployor_onnx(self): # Compute Pytorch model outputs - torch_out = self.model(self.model_inputs[0], self.model_inputs[1], self.model_inputs[2]) + torch_out = model(model_inputs[0], model_inputs[1], model_inputs[2]) # deploy ONNX - backend_file = deploy(self.model, self.model_inputs, input_names, output_names, dynamic_axes, backend='onnxruntime', + backend_file = deploy(model, model_inputs, input_names, output_names, dynamic_axes, backend='onnxruntime', output_file='end2end', fp16=fp16) # Compute ONNX model outputs ort_session = onnxruntime.InferenceSession(backend_file) input_dict = {} - input_dict['voxels'] = self.model_inputs[0].cpu().numpy() - input_dict['num_points'] = self.model_inputs[1].cpu().numpy() - input_dict['coors'] = self.model_inputs[2].cpu().numpy() + input_dict['voxels'] = model_inputs[0].cpu().numpy() + input_dict['num_points'] = model_inputs[1].cpu().numpy() + input_dict['coors'] = model_inputs[2].cpu().numpy() ort_output = ort_session.run(['scores', 'bbox_preds', 'dir_scores'], input_dict) outputs = {} @@ -65,9 +63,9 @@ def test_deployor_onnx(self): def test_deployor_trt(self): # Compute Pytorch model outputs - torch_out = self.model(self.model_inputs[0], self.model_inputs[1], self.model_inputs[2]) + torch_out = model(model_inputs[0], model_inputs[1], model_inputs[2]) # deploy TensorRT - backend_file = deploy(self.model, self.model_inputs, input_names, output_names, dynamic_axes, + backend_file = deploy(model, model_inputs, input_names, output_names, dynamic_axes, backend='tensorrt', output_file='end2end', fp16=fp16) @@ -78,9 +76,9 @@ def test_deployor_trt(self): input_names_trt = list(filter(engine.binding_is_input, names)) output_names_trt = list(set(names) - set(input_names_trt)) input_dict = { - 'voxels': self.model_inputs[0], - 'num_points': self.model_inputs[1], - 'coors': self.model_inputs[2] + 'voxels': model_inputs[0], + 'num_points': model_inputs[1], + 'coors': model_inputs[2] } bindings = [None] * (len(input_names_trt) + len(output_names_trt)) diff --git a/tools/deploy.py b/tools/deploy.py index 2cfdc8b..49459ab 100644 --- a/tools/deploy.py +++ b/tools/deploy.py @@ -10,7 +10,7 @@ from model.model_deployor.deployor_utils import create_input import importlib if importlib.util.find_spec('tensorrt') is not None: - from model.model_deployor.onnx2tensorrt import load_trt_engine, torch_dtype_from_trt, torch_device_from_trt + from model.model_deployor.deployor_utils import load_trt_engine, torch_dtype_from_trt, torch_device_from_trt else: print('Please install TensorRT if you want to convert') diff --git a/tools/engines.py b/tools/engines.py 
index af007d1..41cabd6 100644 --- a/tools/engines.py +++ b/tools/engines.py @@ -7,7 +7,7 @@ from mmdet3d.datasets import build_dataset from model.model_deployor.deployor_utils import create_input -from engine.pointpillars_engine import Pointpillars_engine +from engine.engine_utils import Engine from deephub.detection_model import Pointpillars from engine import fit, eval, predict, inference @@ -70,7 +70,7 @@ def main(args): # get model params if temp_args.model_name == "pointpillars": - parser = Pointpillars_engine.add_model_specific_args(parser) + parser = Engine.add_model_specific_args(parser) # # Add pytorch lightning's args to parser as a group. # parser = Trainer.add_argparse_args(parser)
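For completeness, the ONNX Runtime path exercised by test_deployor.py looks like this when written as a standalone script. The point-cloud path is a placeholder, the dynamic-axes dict mirrors what the test sets up, and the arguments are passed positionally because only that form is visible in the test:

```python
import onnxruntime

from deephub.detection_model import Pointpillars
from model.model_deployor.deployor import deploy
from model.model_deployor.deployor_utils import create_input

device = 'cuda:0'
data, model_inputs = create_input('path/to/points.bin', 'kitti', 'pointpillars', device)

model = Pointpillars()
model.cuda()
model.eval()

backend_file = deploy(
    model, model_inputs,
    ['voxels', 'num_points', 'coors'],                 # input names
    ['scores', 'bbox_preds', 'dir_scores'],            # output names
    {'voxels': {0: 'voxels_num'},                      # dynamic axes (illustrative)
     'num_points': {0: 'voxels_num'},
     'coors': {0: 'voxels_num'}},
    backend='onnxruntime',
    output_file='end2end',
    fp16=False)

session = onnxruntime.InferenceSession(backend_file)
ort_output = session.run(
    ['scores', 'bbox_preds', 'dir_scores'],
    {'voxels': model_inputs[0].cpu().numpy(),
     'num_points': model_inputs[1].cpu().numpy(),
     'coors': model_inputs[2].cpu().numpy()})
print([o.shape for o in ort_output])
```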