diff --git a/configs/localization/bmn/README.md b/configs/localization/bmn/README.md
index ec2f625a95..f30b3a5d40 100644
--- a/configs/localization/bmn/README.md
+++ b/configs/localization/bmn/README.md
@@ -23,11 +23,12 @@ Temporal action proposal generation is a challenging and promising task which
 
 | feature | gpus | pretrain | AUC | AR@1 | AR@5 | AR@10 | AR@100 | gpu_mem(M) | iter time(s) | config | ckpt | log |
 | :-----------: | :--: | :------: | :---: | :---: | :---: | :---: | :----: | :--------: | :----------: | :------------------------------------------: | :----------------------------------------: | :---------------------------------------: |
 | cuhk_mean_100 | 2 | None | 67.25 | 32.89 | 49.43 | 56.64 | 75.29 | 5412 | - | [config](/configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature_20220908-79f92857.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.log) |
+| slowonly-k700 | 2 | None | 68.04 | 33.44 | 50.53 | 57.65 | 75.77 | - | - | [config](/configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature_20230907-50b939b2.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.log) |
 
 1. The **gpus** indicates the number of gpu we used to get the checkpoint. According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
-2. For feature column, cuhk_mean_100 denotes the widely used cuhk activitynet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk).
+2. For the feature column, cuhk_mean_100 denotes the widely used CUHK ActivityNet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk), and slowonly-k700 denotes the feature extracted with MMAction2's [SlowOnly model trained on Kinetics 700](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py). The latter feature can be obtained following [ActivityNet Data Preparation](/tools/data/activitynet/README.md).
 3. We evaluate the action detection performance of BMN, using [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) submission for ActivityNet2017 Untrimmed Video Classification Track to assign label for each action proposal.
 
 \*We train BMN with the [official repo](https://github.com/JJBOY/BMN-Boundary-Matching-Network), evaluate its proposal generation and action detection performance with [anet_cuhk_2017](https://download.openmmlab.com/mmaction/localization/cuhk_anet17_pred.json) for label assigning.
@@ -42,6 +43,12 @@ Train BMN model on ActivityNet features dataset.
 bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-400x100-9e_activitynet-feature.py 2
 ```
 
+Train the BMN model on the ActivityNet SlowOnly-K700 feature dataset.
+
+```shell
+bash tools/dist_train.sh configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py 2
+```
+
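+As a minimal sketch (assuming the standard `tools/train.py` entry point used elsewhere in this repo), the same config can also be trained on a single GPU:
+
+```shell
+python3 tools/train.py configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py
+```
+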
 For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/train_test.md).
 
 ## Test
diff --git a/configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py b/configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py
new file mode 100644
index 0000000000..9230578a86
--- /dev/null
+++ b/configs/localization/bmn/bmn_2xb8-2048x100-9e_activitynet-slowonly-k700-feature.py
@@ -0,0 +1,110 @@
+_base_ = [
+    '../../_base_/models/bmn_400x100.py', '../../_base_/default_runtime.py'
+]
+
+model = dict(feat_dim=2048)
+
+# dataset settings
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/ActivityNet/k700slowonly'
+data_root_val = 'data/ActivityNet/k700slowonly'
+ann_file_train = 'data/ActivityNet/anet_anno_train.json'
+ann_file_val = 'data/ActivityNet/anet_anno_val.json'
+ann_file_test = 'data/ActivityNet/anet_anno_val.json'
+
+train_pipeline = [
+    dict(type='LoadLocalizationFeature'),
+    dict(type='GenerateLocalizationLabels'),
+    dict(
+        type='PackLocalizationInputs',
+        keys=('gt_bbox', ),
+        meta_keys=('video_name', ))
+]
+
+val_pipeline = [
+    dict(type='LoadLocalizationFeature'),
+    dict(type='GenerateLocalizationLabels'),
+    dict(
+        type='PackLocalizationInputs',
+        keys=('gt_bbox', ),
+        meta_keys=('video_name', 'duration_second', 'duration_frame',
+                   'annotations', 'feature_frame'))
+]
+
+test_pipeline = [
+    dict(type='LoadLocalizationFeature'),
+    dict(
+        type='PackLocalizationInputs',
+        keys=('gt_bbox', ),
+        meta_keys=('video_name', 'duration_second', 'duration_frame',
+                   'annotations', 'feature_frame'))
+]
+
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    drop_last=True,
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=dict(video=data_root),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=dict(video=data_root_val),
+        pipeline=val_pipeline,
+        test_mode=True))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=dict(video=data_root_val),
+        pipeline=test_pipeline,
+        test_mode=True))
+
+max_epochs = 9
+train_cfg = dict(
+    type='EpochBasedTrainLoop',
+    max_epochs=max_epochs,
+    val_begin=1,
+    val_interval=1)
+
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+optim_wrapper = dict(
+    optimizer=dict(type='Adam', lr=0.001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=40, norm_type=2))
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=max_epochs,
+        by_epoch=True,
+        milestones=[
+            7,
+        ],
+        gamma=0.1)
+]
+
+work_dir = './work_dirs/bmn_400x100_2x8_9e_activitynet_feature/'
+test_evaluator = dict(
+    type='ANetMetric',
+    metric_type='AR@AN',
+    dump_config=dict(out=f'{work_dir}/results.json', output_format='json'))
+val_evaluator = test_evaluator
diff --git a/configs/localization/bsn/README.md b/configs/localization/bsn/README.md
index efd2d2c0d0..da52d1375d 100644
--- a/configs/localization/bsn/README.md
+++ b/configs/localization/bsn/README.md
@@ -23,17 +23,20 @@ Temporal action proposal generation is an important yet challenging problem, sin
 
 | feature | gpus | pretrain | AUC | AR@1 | AR@5 | AR@10 | AR@100 | gpu_mem(M) | iter time(s) | config | ckpt | log |
 | :-----------: | :--: | :------: | :---: | :---: | :---: | :---: | :----: | :-------------: | :----------: | :----------------------------------------: | :--------------------------------------: | :--------------------------------------: |
 | cuhk_mean_100 | 1 | None | 66.26 | 32.71 | 48.43 | 55.28 | 74.27 | 43(TEM)+25(PEM) | - | [config_TEM](/configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py) [config_PGM](/configs/localization/bsn/bsn_pgm_400x100_activitynet-feature.py) [config_PEM](/configs/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.py) | [ckpt_TEM](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature_20220908-9da79951.pth) [ckpt_PEM](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature_20220908-ec2eb21d.pth) | [log_tem](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.log) [log_pem](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.log) |
+| slowonly-k700 | 1 | None | 67.63 | 33.04 | 48.79 | 56.01 | 75.74 | - | - | [config_TEM](/configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py) [config_PGM](/configs/localization/bsn/bsn_pgm_2048x100_activitynet-slowonly-k700-feature.py) [config_PEM](/configs/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature.py) | [ckpt_TEM](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature_20230907-76069fda.pth) [ckpt_PEM](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature_20230907-44158b6d.pth) | [log_tem](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.log) [log_pem](https://download.openmmlab.com/mmaction/v1.0/localization/bsn/bsn_pem_1xb16-400x100-20e_activitynet-feature.log) |
 
 1. The **gpus** indicates the number of gpu we used to get the checkpoint. According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU, e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
-2. For feature column, cuhk_mean_100 denotes the widely used cuhk activitynet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk).
+2. For the feature column, cuhk_mean_100 denotes the widely used CUHK ActivityNet feature extracted by [anet2016-cuhk](https://github.com/yjxiong/anet2016-cuhk), and slowonly-k700 denotes the feature extracted with MMAction2's [SlowOnly model trained on Kinetics 700](/configs/recognition/slowonly/slowonly_imagenet-pretrained-r50_16xb16-8x8x1-steplr-150e_kinetics700-rgb.py). The latter feature can be obtained following [ActivityNet Data Preparation](/tools/data/activitynet/README.md).
 
 For more details on data preparation, you can refer to [ActivityNet Data Preparation](/tools/data/activitynet/README.md).
 
 ## Training and Test
 
-The traing of the BSN model is three-stages. Firstly train the Temporal evaluation module (TEM):
+The training of the BSN model consists of three stages. We take the `cuhk_mean_100` feature as an example; for the `slowonly-k700` feature, simply replace each config file with the corresponding one that has `slowonly-k700` in its file name, as in the example below.
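+
+For example, a minimal sketch of this substitution for the first (TEM) stage:
+
+```shell
+python3 tools/train.py configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py
+```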
+
+First, train the Temporal Evaluation Module (TEM):
 
 ```shell
 python3 tools/train.py configs/localization/bsn/bsn_tem_1xb16-400x100-20e_activitynet-feature.py
diff --git a/configs/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature.py b/configs/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature.py
new file mode 100644
index 0000000000..25bb7df698
--- /dev/null
+++ b/configs/localization/bsn/bsn_pem_1xb16-2048x100-20e_activitynet-slowonly-k700-feature.py
@@ -0,0 +1,84 @@
+_base_ = [
+    '../../_base_/models/bsn_pem.py', '../../_base_/schedules/adam_20e.py',
+    '../../_base_/default_runtime.py'
+]
+
+# dataset settings
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/ActivityNet/k700slowonly'
+data_root_val = 'data/ActivityNet/k700slowonly'
+ann_file_train = 'data/ActivityNet/anet_anno_train.json'
+ann_file_val = 'data/ActivityNet/anet_anno_val.json'
+ann_file_test = 'data/ActivityNet/anet_anno_val.json'
+
+work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
+pgm_proposals_dir = f'{work_dir}/pgm_proposals/'
+pgm_features_dir = f'{work_dir}/pgm_features/'
+
+train_pipeline = [
+    dict(
+        type='LoadProposals',
+        top_k=500,
+        pgm_proposals_dir=pgm_proposals_dir,
+        pgm_features_dir=pgm_features_dir),
+    dict(
+        type='PackLocalizationInputs',
+        keys=('reference_temporal_iou', 'bsp_feature'),
+        meta_keys=())
+]
+val_pipeline = [
+    dict(
+        type='LoadProposals',
+        top_k=1000,
+        pgm_proposals_dir=pgm_proposals_dir,
+        pgm_features_dir=pgm_features_dir),
+    dict(
+        type='PackLocalizationInputs',
+        keys=('tmin', 'tmax', 'tmin_score', 'tmax_score', 'bsp_feature'),
+        meta_keys=('video_name', 'duration_second', 'duration_frame',
+                   'annotations', 'feature_frame')),
+]
+test_pipeline = val_pipeline
+
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=dict(video=data_root),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=dict(video=data_root_val),
+        pipeline=val_pipeline,
+        test_mode=True))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=dict(video=data_root_val),
+        pipeline=test_pipeline,
+        test_mode=True))
+
+train_cfg = dict(val_interval=20)
+
+test_evaluator = dict(
+    type='ANetMetric',
+    metric_type='AR@AN',
+    dump_config=dict(out=f'{work_dir}/results.json', output_format='json'))
+val_evaluator = test_evaluator
diff --git a/configs/localization/bsn/bsn_pgm_2048x100_activitynet-slowonly-k700-feature.py b/configs/localization/bsn/bsn_pgm_2048x100_activitynet-slowonly-k700-feature.py
new file mode 100644
index 0000000000..544bc12a2e
--- /dev/null
+++ b/configs/localization/bsn/bsn_pgm_2048x100_activitynet-slowonly-k700-feature.py
@@ -0,0 +1,32 @@
+# dataset settings
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/ActivityNet/k700slowonly'
+data_root_val = 'data/ActivityNet/k700slowonly'
+ann_file_train = 'data/ActivityNet/anet_anno_train.json'
+ann_file_val = 'data/ActivityNet/anet_anno_val.json'
+ann_file_test = 'data/ActivityNet/anet_anno_test.json'
+
+work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
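+# Shared stage directories: PGM reads the TEM results dumped under this
+# work_dir and writes its proposals and features to the directories below.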
+tem_results_dir = f'{work_dir}/tem_results/'
+pgm_proposals_dir = f'{work_dir}/pgm_proposals/'
+pgm_features_dir = f'{work_dir}/pgm_features/'
+
+temporal_scale = 100
+pgm_proposals_cfg = dict(
+    pgm_proposals_thread=8, temporal_scale=temporal_scale, peak_threshold=0.5)
+pgm_features_test_cfg = dict(
+    pgm_features_thread=32,
+    top_k=1000,
+    num_sample_start=8,
+    num_sample_end=8,
+    num_sample_action=16,
+    num_sample_interp=3,
+    bsp_boundary_ratio=0.2)
+pgm_features_train_cfg = dict(
+    pgm_features_thread=32,
+    top_k=500,
+    num_sample_start=8,
+    num_sample_end=8,
+    num_sample_action=16,
+    num_sample_interp=3,
+    bsp_boundary_ratio=0.2)
diff --git a/configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py b/configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py
new file mode 100644
index 0000000000..c4e5821e81
--- /dev/null
+++ b/configs/localization/bsn/bsn_tem_1xb16-2048x100-20e_activitynet-k700-feature.py
@@ -0,0 +1,95 @@
+_base_ = ['../../_base_/models/bsn_tem.py', '../../_base_/default_runtime.py']
+
+model = dict(tem_feat_dim=2048)
+
+# dataset settings
+dataset_type = 'ActivityNetDataset'
+data_root = 'data/ActivityNet/k700slowonly'
+data_root_val = 'data/ActivityNet/k700slowonly'
+ann_file_train = 'data/ActivityNet/anet_anno_train.json'
+ann_file_val = 'data/ActivityNet/anet_anno_val.json'
+ann_file_test = 'data/ActivityNet/anet_anno_trainval.json'
+
+train_pipeline = [
+    dict(type='LoadLocalizationFeature'),
+    dict(type='GenerateLocalizationLabels'),
+    dict(
+        type='PackLocalizationInputs',
+        keys=('gt_bbox', ),
+        meta_keys=('video_name', ))
+]
+val_pipeline = [
+    dict(type='LoadLocalizationFeature'),
+    dict(type='GenerateLocalizationLabels'),
+    dict(
+        type='PackLocalizationInputs',
+        keys=('gt_bbox', ),
+        meta_keys=('video_name', ))
+]
+test_pipeline = [
+    dict(type='LoadLocalizationFeature'),
+    dict(type='PackLocalizationInputs', meta_keys=('video_name', ))
+]
+
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=dict(video=data_root),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=16,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=dict(video=data_root_val),
+        pipeline=val_pipeline,
+        test_mode=True))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=dict(video=data_root_val),
+        pipeline=test_pipeline,
+        test_mode=True))
+
+train_cfg = dict(
+    type='EpochBasedTrainLoop', max_epochs=20, val_begin=1, val_interval=20)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+optim_wrapper = dict(
+    optimizer=dict(type='Adam', lr=0.001, weight_decay=0.0001),
+    clip_grad=dict(max_norm=40, norm_type=2))
+
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=20,
+        by_epoch=True,
+        milestones=[7, 14],
+        gamma=0.1)
+]
+
+work_dir = 'work_dirs/bsn_400x100_20e_1x16_activitynet_feature/'
+tem_results_dir = f'{work_dir}/tem_results/'
+
+test_evaluator = dict(
+    type='ANetMetric',
+    metric_type='TEM',
+    dump_config=dict(out=tem_results_dir, output_format='csv'))
+val_evaluator = test_evaluator
+
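+# `filename_tmpl` names TEM checkpoints `tem_epoch_{N}.pth`, presumably so
+# they stay distinguishable from other stages' checkpoints in the shared
+# work_dir.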
+default_hooks = dict(checkpoint=dict(filename_tmpl='tem_epoch_{}.pth'))
diff --git a/mmaction/datasets/transforms/formatting.py b/mmaction/datasets/transforms/formatting.py
index a8e9b9ab82..0ae1475c8b 100644
--- a/mmaction/datasets/transforms/formatting.py
+++ b/mmaction/datasets/transforms/formatting.py
@@ -145,18 +145,17 @@ def transform(self, results):
         for key in self.keys:
             if key not in results:
                 continue
-            if key == 'gt_bbox':
-                instance_data = InstanceData()
-                instance_data[key] = to_tensor(results[key])
-                data_sample.gt_instances = instance_data
             elif key == 'proposals':
                 instance_data = InstanceData()
                 instance_data[key] = to_tensor(results[key])
                 data_sample.proposals = instance_data
             else:
-                raise NotImplementedError(
-                    f"Key '{key}' is not supported in `PackLocalizationInputs`"
-                )
+                if hasattr(data_sample, 'gt_instances'):
+                    data_sample.gt_instances[key] = to_tensor(results[key])
+                else:
+                    instance_data = InstanceData()
+                    instance_data[key] = to_tensor(results[key])
+                    data_sample.gt_instances = instance_data
 
         img_meta = {k: results[k] for k in self.meta_keys if k in results}
         data_sample.set_metainfo(img_meta)
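The change above means `PackLocalizationInputs` no longer special-cases `gt_bbox`: any packed key other than `proposals` is merged into a single `gt_instances`, which is what lets the new BSN PEM pipelines pack keys such as `bsp_feature` and `reference_temporal_iou` without hitting the old `NotImplementedError`. Below is a standalone sketch of that dispatch (a hypothetical harness using only `torch` and `mmengine`; `DummySample` and `pack_keys` are illustrative names, not MMAction2 API):

```python
import torch
from mmengine.structures import InstanceData


class DummySample:
    """Stand-in for ActionDataSample, just enough for this sketch."""


def pack_keys(results, keys):
    """Replicates the key dispatch in PackLocalizationInputs.transform."""
    data_sample = DummySample()
    for key in keys:
        if key not in results:
            continue
        elif key == 'proposals':
            instance_data = InstanceData()
            instance_data[key] = torch.as_tensor(results[key])
            data_sample.proposals = instance_data
        else:
            # New fallback: every remaining key accumulates on gt_instances.
            if hasattr(data_sample, 'gt_instances'):
                data_sample.gt_instances[key] = torch.as_tensor(results[key])
            else:
                instance_data = InstanceData()
                instance_data[key] = torch.as_tensor(results[key])
                data_sample.gt_instances = instance_data
    return data_sample


# PEM-style inputs: one row per proposal, so both keys share length 5.
results = dict(
    bsp_feature=torch.rand(5, 32),
    reference_temporal_iou=torch.rand(5))
sample = pack_keys(results, keys=('reference_temporal_iou', 'bsp_feature'))
print(sample.gt_instances)  # one InstanceData holding both keys
```

Note that `InstanceData` enforces one consistent instance count per collection, so every key routed into `gt_instances` this way must share the same first dimension, as the PEM pipeline's per-proposal keys do.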