[Feature] Add ArT (#1006)

* add art * fix typo
open-mmlab · May 17, 2022 · 13986f4 · 13986f4
1 parent d9bb3d6
commit 13986f4
Show file tree

Hide file tree

Showing 4 changed files with 350 additions and 0 deletions.
diff --git a/docs/en/datasets/det.md b/docs/en/datasets/det.md
@@ -31,6 +31,7 @@
 |        BID        |                                                                                               [homepage](https://github.com/ricardobnjunior/Brazilian-Identity-Document-Dataset)                                                                                               |                                                                                                              -                                                                                                               |                                              -                                               |                                               -                                                |
 |       RCTW        |                                                                                                                 [homepage](https://rctw.vlrlab.net/index.html)                                                                                                                 |                                                                                                              -                                                                                                               |                                              -                                               |                                               -                                                |
 |     HierText      |                                                                                                        [homepage](https://github.com/google-research-datasets/hiertext)                                                                                                        |                                                                                                              -                                                                                                               |                                              -                                               |                                               -                                                |
+|       ArT        |                   [homepage](https://rrc.cvc.uab.es/?ch=14)                    |                                                                                                                                                                                                           -                                                                                                                                                                                                           |                                                             -                                                             | - |
 
 ### Install AWS CLI (optional)
 
@@ -941,3 +942,41 @@ inconsistency results in false examples in the training set. Therefore, users sh
   │   ├── instances_training.json
   │   └── instances_val.json
   ```
+
+## ArT
+
+- Step1: Download `train_images.tar.gz` and `train_labels.json` from the [homepage](https://rrc.cvc.uab.es/?ch=14&com=downloads) to `art/`
+
+  ```bash
+  mkdir art && cd art
+  mkdir annotations
+
+  # Download ArT dataset
+  wget https://dataset-bj.cdn.bcebos.com/art/train_images.tar.gz --no-check-certificate
+  wget https://dataset-bj.cdn.bcebos.com/art/train_labels.json --no-check-certificate
+
+  # Extract
+  tar -xf train_images.tar.gz
+  mv train_images imgs
+  mv train_labels.json annotations/
+
+  # Remove unnecessary files
+  rm train_images.tar.gz
+  ```
+
+- Step2: Generate `instances_training.json` and `instances_val.json` (optional). Since the test annotations are not publicly available, you may specify `--val-ratio` to split the dataset. E.g., if val-ratio is 0.2, then 20% of the data are left out as the validation set in this example.
+
+  ```bash
+  # Annotations of the ArT test split are not publicly available; add --val-ratio 0.2 to split off a validation set
+  python tools/data/textdet/art_converter.py PATH/TO/art
+  ```
+
+- After running the above commands, the directory structure should be as follows:
+
+  ```text
+  │── art
+  │   ├── annotations
+  │   ├── imgs
+  │   ├── instances_training.json
+  │   └── instances_val.json (optional)
+  ```
diff --git a/docs/en/datasets/recog.md b/docs/en/datasets/recog.md
@@ -35,6 +35,7 @@
 |          BID          |          [homepage](https://github.com/ricardobnjunior/Brazilian-Identity-Document-Dataset)           |                                                                                                                                                                                                           -                                                                                                                                                                                                           |                                                             -                                                             | - |
 |         RCTW          |                            [homepage](https://rctw.vlrlab.net/index.html)                             |                                                                                                                                                                                                           -                                                                                                                                                                                                           |                                                             -                                                             | - |
 |       HierText        |                   [homepage](https://github.com/google-research-datasets/hiertext)                    |                                                                                                                                                                                                           -                                                                                                                                                                                                           |                                                             -                                                             | - |
+|       ArT        |                   [homepage](https://rrc.cvc.uab.es/?ch=14)                    |                                                                                                                                                                                                           -                                                                                                                                                                                                           |                                                             -                                                             | - |
 
 (*) Since the official homepage is unavailable now, we provide an alternative for quick reference. However, we do not guarantee the correctness of the dataset.
 
@@ -1116,3 +1117,40 @@ should be as follows:
   │   ├── train_label.jsonl
   │   └── val_label.jsonl
   ```
+
+## ArT
+
+- Step1: Download `train_task2_images.tar.gz` and `train_task2_labels.json` from the [homepage](https://rrc.cvc.uab.es/?ch=14&com=downloads) to `art/`
+
+  ```bash
+  mkdir art && cd art
+  mkdir annotations
+
+  # Download ArT dataset
+  wget https://dataset-bj.cdn.bcebos.com/art/train_task2_images.tar.gz
+  wget https://dataset-bj.cdn.bcebos.com/art/train_task2_labels.json
+
+  # Extract
+  tar -xf train_task2_images.tar.gz
+  mv train_task2_images crops
+  mv train_task2_labels.json annotations/
+
+  # Remove unnecessary files
+  rm train_task2_images.tar.gz
+  ```
+
+- Step2: Generate `train_label.jsonl` and `val_label.jsonl` (optional). Since the test annotations are not publicly available, you may specify `--val-ratio` to split the dataset. E.g., if val-ratio is 0.2, then 20% of the data are left out as the validation set in this example.
+
+  ```bash
+  # Annotations of the ArT test split are not publicly available; add --val-ratio 0.2 to split off a validation set
+  python tools/data/textrecog/art_converter.py PATH/TO/art
+  ```
+
+- After running the above commands, the directory structure should be as follows:
+
+  ```text
+  │── art
+  │   ├── annotations
+  │   ├── crops
+  │   ├── train_label.jsonl
+  │   └── val_label.jsonl (optional)
+  ```
diff --git a/tools/data/textdet/art_converter.py b/tools/data/textdet/art_converter.py
@@ -0,0 +1,144 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import math
+import os.path as osp
+
+import mmcv
+
+from mmocr.utils import convert_annotations
+
+
+def _val_ratio(value):
+    """Parse the ``--val-ratio`` option and make sure it lies in [0, 1].
+
+    The split logic of this script derives an integer period from
+    ``1 / ratio``, which is only meaningful for ratios up to 1. Rejecting
+    other values here gives the user an immediate and readable argparse
+    error instead of a failure in the middle of the conversion.
+
+    Args:
+        value (str): Raw command line value.
+
+    Returns:
+        float: The validated ratio.
+
+    Raises:
+        argparse.ArgumentTypeError: If ``value`` is not a number in [0, 1].
+    """
+    try:
+        ratio = float(value)
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            f'{value!r} is not a number') from None
+    # Written as a chained comparison so that NaN is rejected as well.
+    if not 0.0 <= ratio <= 1.0:
+        raise argparse.ArgumentTypeError(
+            f'val-ratio must be within [0, 1], but got {value}')
+    return ratio
+
+
+def parse_args():
+    """Parse the command line arguments of the ArT detection converter.
+
+    Returns:
+        argparse.Namespace: Namespace holding ``root_path`` (str) and
+        ``val_ratio`` (float in [0, 1], 0.0 by default).
+    """
+    parser = argparse.ArgumentParser(
+        description='Generate training and validation set of ArT ')
+    parser.add_argument('root_path', help='Root dir path of ArT')
+    parser.add_argument(
+        '--val-ratio',
+        help='Split ratio for val set',
+        default=0.0,
+        type=_val_ratio)
+    return parser.parse_args()
+
+
+def _split_art_prefixes(img_prefixes, ratio):
+    """Split image prefixes into a training and a validation list.
+
+    With ``period = math.floor(1 / ratio)``, every prefix whose index is a
+    multiple of ``period`` goes to the validation list and all the others go
+    to the training list. The result only depends on the iteration order of
+    ``img_prefixes``, so calling this twice on the same annotation file
+    yields the same split. Because the period is an integer, the effective
+    validation share is ``1 / period`` (e.g. any ratio in (0.5, 1] sends
+    every image to the validation list).
+
+    Args:
+        img_prefixes (Iterable[str]): Image names without extension.
+        ratio (float): Split ratio for val set. A value that is not greater
+            than 0 disables the split. Values above 1 must be rejected by
+            the caller since they would lead to a zero period.
+
+    Returns:
+        tuple[list[str], list[str]]: Training prefixes and val prefixes.
+    """
+    img_prefixes = list(img_prefixes)
+    if not ratio > 0:
+        return img_prefixes, []
+    period = math.floor(1 / ratio)
+    trn_files, val_files = [], []
+    for i, prefix in enumerate(img_prefixes):
+        if i % period:
+            trn_files.append(prefix)
+        else:
+            val_files.append(prefix)
+    return trn_files, val_files
+
+
+def _art_instance_to_anno(ann):
+    """Convert one ArT text instance to a COCO-like annotation dict.
+
+    Args:
+        ann (dict): An instance with the keys ``points``, ``transcription``
+            and ``illegibility``.
+
+    Returns:
+        dict: Annotation with ``iscrowd``, ``category_id``, ``bbox``
+        (``[x, y, w, h]``), ``area`` and ``segmentation``.
+    """
+    # Flatten the polygon to [x1, y1, x2, y2, ...], clamping negative
+    # coordinates to 0.
+    segmentation = []
+    for x, y in ann['points']:
+        segmentation.extend((max(0, x), max(0, y)))
+    xs, ys = segmentation[::2], segmentation[1::2]
+    x, y = min(xs), min(ys)
+    w, h = max(xs) - x, max(ys) - y
+    # Instances transcribed as '###' or flagged illegible are marked crowd.
+    if ann['transcription'] == '###' or ann['illegibility']:
+        iscrowd = 1
+    else:
+        iscrowd = 0
+    return dict(
+        iscrowd=iscrowd,
+        category_id=1,
+        bbox=[x, y, w, h],
+        area=w * h,
+        segmentation=[segmentation])
+
+
+def collect_art_info(root_path, split, ratio, print_every=1000):
+    """Collect the annotation information.
+
+    The annotation format is as the following:
+    {
+        'gt_1726': # 'gt_1726' is file name
+        [
+            {
+                'transcription': '燎申集团',
+                'points': [
+                    [141, 199],
+                    [237, 201],
+                    [313, 236],
+                    [357, 283],
+                    [359, 300],
+                    [309, 261],
+                    [233, 230],
+                    [140, 231]
+                ],
+                'language': 'Chinese',
+                'illegibility': False
+            },
+            ...
+        ],
+        ...
+    }
+
+
+    Args:
+        root_path (str): Root path to the dataset
+        split (str): Dataset split, which should be 'train' or 'val'
+        ratio (float): Split ratio for val set, at most 1. A value that is
+            not greater than 0 puts every image into the training split
+        print_every (int): Print log info per iteration
+
+    Returns:
+        list[dict]: One dict per image that exists under
+        ``root_path/imgs``, holding ``file_name``, ``height``, ``width``,
+        ``segm_file`` and ``anno_info`` (the list of instance annotations)
+
+    Raises:
+        NotImplementedError: If ``split`` is neither 'train' nor 'val'
+        ValueError: If ``ratio`` is greater than 1
+        FileNotFoundError: If the annotation file does not exist
+    """
+
+    # Validate the cheap arguments first so that mistakes are reported
+    # before the annotation file is loaded.
+    if split not in ('train', 'val'):
+        raise NotImplementedError(
+            f"split should be 'train' or 'val', but got {split!r}")
+    if ratio > 1:
+        # math.floor(1 / ratio) would be 0 and break the modulo based split.
+        raise ValueError(f'ratio should not exceed 1, but got {ratio}')
+
+    annotation_path = osp.join(root_path, 'annotations/train_labels.json')
+    if not osp.exists(annotation_path):
+        raise FileNotFoundError(
+            f'{annotation_path} not exists, please check and try again.')
+
+    annotation = mmcv.load(annotation_path)
+    trn_files, val_files = _split_art_prefixes(annotation.keys(), ratio)
+    print(f'training #{len(trn_files)}, val #{len(val_files)}')
+    img_prefixes = trn_files if split == 'train' else val_files
+
+    segm_file = osp.basename(annotation_path)
+    img_infos = []
+    num_skipped = 0
+    for i, prefix in enumerate(img_prefixes):
+        if i > 0 and i % print_every == 0:
+            print(f'{i}/{len(img_prefixes)}')
+        img_file = osp.join(root_path, 'imgs', prefix + '.jpg')
+        # Skip not exist images
+        if not osp.exists(img_file):
+            num_skipped += 1
+            continue
+        img = mmcv.imread(img_file)
+        # NOTE(review): assumes mmcv.imread returns None for a file it cannot
+        # decode; without this guard ``img.shape`` would fail - confirm.
+        if img is None:
+            num_skipped += 1
+            continue
+
+        img_infos.append(
+            dict(
+                file_name=osp.basename(img_file),
+                height=img.shape[0],
+                width=img.shape[1],
+                segm_file=segm_file,
+                anno_info=[
+                    _art_instance_to_anno(ann) for ann in annotation[prefix]
+                ]))
+
+    if num_skipped:
+        print(f'skipped {num_skipped} missing or unreadable images')
+
+    return img_infos
+
+
+def main():
+    """Convert the ArT detection annotations under the given root path.
+
+    The training set is always written to ``instances_training.json``. The
+    validation set is written to ``instances_val.json`` only when a positive
+    ``--val-ratio`` is given.
+    """
+    args = parse_args()
+    root_path, val_ratio = args.root_path, args.val_ratio
+
+    # (split name, progress message, output file name)
+    jobs = [('train', 'Processing training set...',
+             'instances_training.json')]
+    if val_ratio > 0:
+        jobs.append(('val', 'Processing validation set...',
+                     'instances_val.json'))
+
+    for split, message, out_name in jobs:
+        print(message)
+        infos = collect_art_info(root_path, split, val_ratio)
+        convert_annotations(infos, osp.join(root_path, out_name))
+    print('Finish')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/data/textrecog/art_converter.py b/tools/data/textrecog/art_converter.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import math
+import os.path as osp
+
+import mmcv
+
+from mmocr.utils.fileio import list_to_file
+
+
+def _val_ratio(value):
+    """Parse the ``--val-ratio`` option and make sure it lies in [0, 1].
+
+    The split logic of this script derives an integer period from
+    ``1 / ratio``, which is only meaningful for ratios up to 1. Rejecting
+    other values here gives the user an immediate and readable argparse
+    error instead of a failure in the middle of the conversion.
+
+    Args:
+        value (str): Raw command line value.
+
+    Returns:
+        float: The validated ratio.
+
+    Raises:
+        argparse.ArgumentTypeError: If ``value`` is not a number in [0, 1].
+    """
+    try:
+        ratio = float(value)
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            f'{value!r} is not a number') from None
+    # Written as a chained comparison so that NaN is rejected as well.
+    if not 0.0 <= ratio <= 1.0:
+        raise argparse.ArgumentTypeError(
+            f'val-ratio must be within [0, 1], but got {value}')
+    return ratio
+
+
+def parse_args():
+    """Parse the command line arguments of the ArT recognition converter.
+
+    Returns:
+        argparse.Namespace: Namespace holding ``root_path`` (str),
+        ``val_ratio`` (float in [0, 1], 0.0 by default), ``nproc`` (int, 1
+        by default) and ``format`` ('jsonl' by default, or 'txt').
+    """
+    parser = argparse.ArgumentParser(
+        description='Generate training and validation set of ArT ')
+    parser.add_argument('root_path', help='Root dir path of ArT')
+    parser.add_argument(
+        '--val-ratio',
+        help='Split ratio for val set',
+        default=0.0,
+        type=_val_ratio)
+    # NOTE: --nproc is accepted for command line compatibility, but the
+    # conversion in this script runs in a single process and never reads it.
+    parser.add_argument(
+        '--nproc', default=1, type=int, help='Number of processes')
+    parser.add_argument(
+        '--format',
+        default='jsonl',
+        help='Use jsonl or string to format annotations',
+        choices=['jsonl', 'txt'])
+    return parser.parse_args()
+
+
+def convert_art(root_path, split, ratio, format):
+    """Generate the recognition label file of one ArT split.
+
+    The transcription of the first instance of every annotated crop is
+    written to ``{root_path}/{split}_label.{format}``, one sample per line.
+    The images themselves are not touched; the labels refer to them as
+    ``crops/<name>.jpg``.
+
+    The annotation format is as the following:
+    {
+        "gt_2836_0": [
+            {
+                "transcription": "URDER",
+                "points": [
+                    [25, 51],
+                    [0, 2],
+                    [21, 0],
+                    [42, 43]
+                ],
+                "language": "Latin",
+                "illegibility": false
+            }
+        ], ...
+    }
+
+
+    Args:
+        root_path (str): The root path of the dataset
+        split (str): The split of dataset, which should be 'train' or 'val'
+        ratio (float): Split ratio for val set, at most 1. A value that is
+            not greater than 0 puts every sample into the training split
+        format (str): Annotation format, whether be txt or jsonl
+
+    Raises:
+        NotImplementedError: If ``split`` or ``format`` is not supported
+        ValueError: If ``ratio`` is greater than 1
+        FileNotFoundError: If the annotation file does not exist
+    """
+
+    # Validate the cheap arguments first so that mistakes are reported
+    # before the annotation file is loaded or any output is written.
+    if split not in ('train', 'val'):
+        raise NotImplementedError(
+            f"split should be 'train' or 'val', but got {split!r}")
+    if format not in ('jsonl', 'txt'):
+        # Previously an unknown format silently produced an empty file.
+        raise NotImplementedError(
+            f"format should be 'jsonl' or 'txt', but got {format!r}")
+    if ratio > 1:
+        # math.floor(1 / ratio) would be 0 and break the modulo based split.
+        raise ValueError(f'ratio should not exceed 1, but got {ratio}')
+
+    annotation_path = osp.join(root_path,
+                               'annotations/train_task2_labels.json')
+    if not osp.exists(annotation_path):
+        raise FileNotFoundError(
+            f'{annotation_path} not exists, please check and try again.')
+
+    annotation = mmcv.load(annotation_path)
+    # outputs
+    dst_label_file = osp.join(root_path, f'{split}_label.{format}')
+
+    img_prefixes = list(annotation)
+
+    # Every ``period``-th sample (starting with the first one) goes to the
+    # val set. The split only depends on the order of the annotation file,
+    # so the 'train' and 'val' calls see complementary subsets.
+    if ratio > 0:
+        period = math.floor(1 / ratio)
+        trn_files = [p for i, p in enumerate(img_prefixes) if i % period]
+        val_files = [
+            p for i, p in enumerate(img_prefixes) if not i % period
+        ]
+    else:
+        trn_files, val_files = img_prefixes, []
+    print(f'training #{len(trn_files)}, val #{len(val_files)}')
+
+    img_prefixes = trn_files if split == 'train' else val_files
+
+    labels = []
+    for prefix in img_prefixes:
+        instances = annotation[prefix]
+        # A crop without any instance has no transcription to learn from.
+        if not instances:
+            continue
+        text_label = instances[0]['transcription']
+        dst_img_name = prefix + '.jpg'
+
+        if format == 'txt':
+            labels.append(f'crops/{dst_img_name} {text_label}')
+        else:
+            labels.append(
+                json.dumps(
+                    {
+                        'filename': f'crops/{dst_img_name}',
+                        'text': text_label
+                    },
+                    ensure_ascii=False))
+
+    list_to_file(dst_label_file, labels)
+
+
+def main():
+    """Convert the ArT recognition annotations under the given root path.
+
+    The training labels are always generated. The validation labels are
+    generated only when a positive ``--val-ratio`` is given.
+    """
+    args = parse_args()
+    shared = dict(
+        root_path=args.root_path, ratio=args.val_ratio, format=args.format)
+
+    # (split name, progress message)
+    jobs = [('train', 'Processing training set...')]
+    if args.val_ratio > 0:
+        jobs.append(('val', 'Processing validation set...'))
+
+    for split, message in jobs:
+        print(message)
+        convert_art(split=split, **shared)
+    print('Finish')
+
+
+if __name__ == '__main__':
+    main()