Merge remote-tracking branch 'upstream/dev-1.x' into add-transfusion-latest
Showing 120 changed files with 8,621 additions and 2,095 deletions.
```diff
@@ -1,10 +1,14 @@
 version: 2
 
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.8"
+
 formats:
   - epub
 
 python:
-  version: 3.7
   install:
     - requirements: requirements/docs.txt
     - requirements: requirements/readthedocs.txt
```
@@ -0,0 +1,184 @@
```python
# dataset settings
# D3 in the config name means the whole dataset is divided into 3 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
metainfo = dict(classes=class_names)
input_modality = dict(use_lidar=False, use_camera=True)

# Example to use different file client
# Method 1: simply set the data root and let the file I/O module
# automatically infer from prefix (not support LMDB and Memcache yet)

# data_root = 's3://openmmlab/datasets/detection3d/waymo/kitti_format/'

# Method 2: Use backend_args, file_client_args in versions before 1.1.0
# backend_args = dict(
#     backend='petrel',
#     path_mapping=dict({
#         './data/': 's3://openmmlab/datasets/detection3d/',
#         'data/': 's3://openmmlab/datasets/detection3d/'
#     }))
backend_args = None

train_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='LoadAnnotations3D',
        with_bbox=True,
        with_label=True,
        with_attr_label=False,
        with_bbox_3d=True,
        with_label_3d=True,
        with_bbox_depth=True),
    # base shape (1248, 832), scale (0.95, 1.05)
    dict(
        type='RandomResize3D',
        scale=(1248, 832),
        ratio_range=(0.95, 1.05),
        # ratio_range=(1., 1.),
        interpolation='nearest',
        keep_ratio=True,
    ),
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(
        type='Pack3DDetInputs',
        keys=[
            'img', 'gt_bboxes', 'gt_bboxes_labels', 'gt_bboxes_3d',
            'gt_labels_3d', 'centers_2d', 'depths'
        ]),
]

test_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='RandomResize3D',
        scale=(1248, 832),
        ratio_range=(1., 1.),
        interpolation='nearest',
        keep_ratio=True),
    dict(
        type='Pack3DDetInputs',
        keys=['img'],
        meta_keys=[
            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
        ]),
]
# construct a pipeline for data and gt loading in show function
# please keep its loading function consistent with test_pipeline (e.g. client)
eval_pipeline = [
    dict(type='LoadImageFromFileMono3D', backend_args=backend_args),
    dict(
        type='RandomResize3D',
        scale=(1248, 832),
        ratio_range=(1., 1.),
        interpolation='nearest',
        keep_ratio=True),
    dict(
        type='Pack3DDetInputs',
        keys=['img'],
        meta_keys=[
            'box_type_3d', 'img_shape', 'cam2img', 'scale_factor',
            'sample_idx', 'context_name', 'timestamp', 'lidar2cam'
        ]),
]
```
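The three pipelines above are plain config dicts that MMEngine instantiates through its transform registry at runtime. Below is a minimal sketch of doing that by hand; it is not part of the commit and assumes an mmdet3d 1.x (dev-1.x) installation, with `train_pipeline` referring to the list defined above.

```python
# Sketch only: instantiate the transform dicts above to check that every
# 'type' is registered. Assumes mmdet3d dev-1.x is installed.
from mmdet3d.registry import TRANSFORMS
from mmdet3d.utils import register_all_modules

register_all_modules(init_default_scope=True)  # registers mmdet3d datasets, transforms, models
transforms = [TRANSFORMS.build(step) for step in train_pipeline]
for t in transforms:
    print(type(t).__name__)  # LoadImageFromFileMono3D, LoadAnnotations3D, ...
```

The dataloaders in the rest of the file pass these same lists to the dataset, which composes and applies them per sample.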
```python
train_dataloader = dict(
    batch_size=3,
    num_workers=3,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='waymo_infos_train.pkl',
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        pipeline=train_pipeline,
        modality=input_modality,
        test_mode=False,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='Camera',
        load_type='fov_image_based',
        # load one frame every three frames
        load_interval=3,
        backend_args=backend_args))

val_dataloader = dict(
    batch_size=1,
    num_workers=1,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        ann_file='waymo_infos_val.pkl',
        pipeline=eval_pipeline,
        modality=input_modality,
        test_mode=True,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='Camera',
        load_type='fov_image_based',
        load_eval_anns=False,
        backend_args=backend_args))

test_dataloader = dict(
    batch_size=1,
    num_workers=1,
    persistent_workers=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            pts='training/velodyne',
            CAM_FRONT='training/image_0',
            CAM_FRONT_LEFT='training/image_1',
            CAM_FRONT_RIGHT='training/image_2',
            CAM_SIDE_LEFT='training/image_3',
            CAM_SIDE_RIGHT='training/image_4'),
        ann_file='waymo_infos_val.pkl',
        pipeline=eval_pipeline,
        modality=input_modality,
        test_mode=True,
        metainfo=metainfo,
        cam_sync_instances=True,
        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
        box_type_3d='Camera',
        load_type='fov_image_based',
        backend_args=backend_args))

val_evaluator = dict(
    type='WaymoMetric',
    waymo_bin_file='./data/waymo/waymo_format/fov_gt.bin',
    metric='LET_mAP',
    load_type='fov_image_based',
    result_prefix='./pgd_fov_pred')
test_evaluator = val_evaluator

vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
```
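To exercise the new dataset config end to end, something like the following sketch would work. The config path is a placeholder (the commit listing does not show where the new file lives), and it assumes the Waymo data has already been converted to `data/waymo/kitti_format/` with the `waymo_infos_*.pkl` files in place.

```python
# Sketch only: load the dataset config with MMEngine and build the training
# dataloader from it. The file path below is a placeholder.
from mmengine.config import Config
from mmengine.runner import Runner
from mmdet3d.utils import register_all_modules

register_all_modules(init_default_scope=True)
cfg = Config.fromfile('configs/_base_/datasets/waymo_mono3d_fov.py')  # placeholder path
train_loader = Runner.build_dataloader(cfg.train_dataloader)
batch = next(iter(train_loader))  # dict with 'inputs' and 'data_samples'
```

`Runner.build_dataloader` is a static helper in MMEngine, so no full runner (model, schedules, hooks) is needed just to iterate the data.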