diff --git a/docs/en/datasets/det.md b/docs/en/datasets/det.md
index 5c365cffc..5c7fce4a7 100644
--- a/docs/en/datasets/det.md
+++ b/docs/en/datasets/det.md
@@ -31,6 +31,7 @@
 | BID | [homepage](https://github.com/ricardobnjunior/Brazilian-Identity-Document-Dataset) | - | - | - |
 | RCTW | [homepage](https://rctw.vlrlab.net/index.html) | - | - | - |
 | HierText | [homepage](https://github.com/google-research-datasets/hiertext) | - | - | - |
+| ArT | [homepage](https://rrc.cvc.uab.es/?ch=14) | - | - | - |

 ### Install AWS CLI (optional)
@@ -941,3 +942,41 @@ inconsistency results in false examples in the training set. Therefore, users sh
 │   ├── instances_training.json
 │   └── instances_val.json
 ```
+
+## ArT
+
+- Step1: Download `train_images.tar.gz` and `train_labels.json` from the [homepage](https://rrc.cvc.uab.es/?ch=14&com=downloads) to `art/`
+
+  ```bash
+  mkdir art && cd art
+  mkdir annotations
+
+  # Download ArT dataset
+  wget https://dataset-bj.cdn.bcebos.com/art/train_images.tar.gz --no-check-certificate
+  wget https://dataset-bj.cdn.bcebos.com/art/train_labels.json --no-check-certificate
+
+  # Extract
+  tar -xf train_images.tar.gz
+  mv train_images imgs
+  mv train_labels.json annotations/
+
+  # Remove unnecessary files
+  rm train_images.tar.gz
+  ```
+
+- Step2: Generate `instances_training.json` and `instances_val.json` (optional). Since the test annotations are not publicly available, you may specify `--val-ratio` to split a validation set from the training data. For example, with `--val-ratio 0.2`, 20% of the images are held out as the validation set.
+
+  ```bash
+  # The test split annotations of ArT are not publicly available; add --val-ratio 0.2 to hold out a validation set
+  python tools/data/textdet/art_converter.py PATH/TO/art
+  ```
+
+- After running the above commands, the directory structure should be as follows:
+
+  ```text
+  │── art
+  │   ├── annotations
+  │   ├── imgs
+  │   ├── instances_training.json
+  │   └── instances_val.json (optional)
+  ```
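+
+- To sanity-check the conversion, you can load the generated file and count its entries. A minimal sketch, assuming the COCO-like layout (`images`/`annotations` lists) written by `convert_annotations`:
+
+  ```python
+  import mmcv
+
+  # Load the converted annotations (plain JSON, so mmcv.load handles it)
+  data = mmcv.load('PATH/TO/art/instances_training.json')
+  # Assumed COCO-style keys written by convert_annotations
+  print(len(data['images']), 'images,', len(data['annotations']), 'annotations')
+  ```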
diff --git a/docs/en/datasets/recog.md b/docs/en/datasets/recog.md
index 12a35271a..728e58f19 100644
--- a/docs/en/datasets/recog.md
+++ b/docs/en/datasets/recog.md
@@ -35,6 +35,7 @@
 | BID | [homepage](https://github.com/ricardobnjunior/Brazilian-Identity-Document-Dataset) | - | - | - |
 | RCTW | [homepage](https://rctw.vlrlab.net/index.html) | - | - | - |
 | HierText | [homepage](https://github.com/google-research-datasets/hiertext) | - | - | - |
+| ArT | [homepage](https://rrc.cvc.uab.es/?ch=14) | - | - | - |

 (*) Since the official homepage is unavailable now, we provide an alternative for quick reference. However, we do not guarantee the correctness of the dataset.
@@ -1116,3 +1117,40 @@ should be as follows:
 │   ├── train_label.jsonl
 │   └── val_label.jsonl
 ```
+
+## ArT
+
+- Step1: Download `train_task2_images.tar.gz` and `train_task2_labels.json` from the [homepage](https://rrc.cvc.uab.es/?ch=14&com=downloads) to `art/`
+
+  ```bash
+  mkdir art && cd art
+  mkdir annotations
+
+  # Download ArT dataset
+  wget https://dataset-bj.cdn.bcebos.com/art/train_task2_images.tar.gz
+  wget https://dataset-bj.cdn.bcebos.com/art/train_task2_labels.json
+
+  # Extract
+  tar -xf train_task2_images.tar.gz
+  mv train_task2_images crops
+  mv train_task2_labels.json annotations/
+
+  # Remove unnecessary files
+  rm train_task2_images.tar.gz
+  ```
+
+- Step2: Generate `train_label.jsonl` and `val_label.jsonl` (optional). Since the test annotations are not publicly available, you may specify `--val-ratio` to split a validation set from the training data. For example, with `--val-ratio 0.2`, 20% of the crops are held out as the validation set.
+
+  ```bash
+  # The test split annotations of ArT are not publicly available; add --val-ratio 0.2 to hold out a validation set
+  python tools/data/textrecog/art_converter.py PATH/TO/art
+  ```
+
+- After running the above commands, the directory structure should be as follows:
+
+  ```text
+  │── art
+  │   ├── crops
+  │   ├── train_label.jsonl
+  │   └── val_label.jsonl (optional)
+  ```
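+
+- Each line of the generated label file is one standalone JSON object. For instance, a crop `gt_2836_0.jpg` transcribed as `URDER` (the sample from the converter's docstring) is serialized as:
+
+  ```json
+  {"filename": "crops/gt_2836_0.jpg", "text": "URDER"}
+  ```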
diff --git a/tools/data/textdet/art_converter.py b/tools/data/textdet/art_converter.py
new file mode 100644
index 000000000..950da5a9c
--- /dev/null
+++ b/tools/data/textdet/art_converter.py
@@ -0,0 +1,144 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import math
+import os.path as osp
+
+import mmcv
+
+from mmocr.utils import convert_annotations
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Generate training and validation set of ArT')
+    parser.add_argument('root_path', help='Root dir path of ArT')
+    parser.add_argument(
+        '--val-ratio', help='Split ratio for val set', default=0.0, type=float)
+    args = parser.parse_args()
+    return args
+
+
+def collect_art_info(root_path, split, ratio, print_every=1000):
+    """Collect the annotation information.
+
+    The annotation format is as follows:
+    {
+        'gt_1726': [  # 'gt_1726' is the image file name prefix
+            {
+                'transcription': '燎申集团',
+                'points': [
+                    [141, 199],
+                    [237, 201],
+                    [313, 236],
+                    [357, 283],
+                    [359, 300],
+                    [309, 261],
+                    [233, 230],
+                    [140, 231]
+                ],
+                'language': 'Chinese',
+                'illegibility': False
+            },
+            ...
+        ],
+        ...
+    }
+
+    Args:
+        root_path (str): Root path to the dataset
+        split (str): Dataset split, which should be 'train' or 'val'
+        ratio (float): Split ratio for val set
+        print_every (int): Print log info per iteration
+
+    Returns:
+        img_infos (list[dict]): A list of dicts of the image and annotation
+            information
+    """
+
+    annotation_path = osp.join(root_path, 'annotations/train_labels.json')
+    if not osp.exists(annotation_path):
+        raise FileNotFoundError(
+            f'{annotation_path} does not exist, please check and try again.')
+
+    annotation = mmcv.load(annotation_path)
+    img_prefixes = annotation.keys()
+
+    trn_files, val_files = [], []
+    if ratio > 0:
+        # Samples at indices 0, floor(1/ratio), 2*floor(1/ratio), ... go to
+        # the val set; all others go to the training set
+        for i, file in enumerate(img_prefixes):
+            if i % math.floor(1 / ratio):
+                trn_files.append(file)
+            else:
+                val_files.append(file)
+    else:
+        trn_files, val_files = img_prefixes, []
+    print(f'training #{len(trn_files)}, val #{len(val_files)}')
+
+    if split == 'train':
+        img_prefixes = trn_files
+    elif split == 'val':
+        img_prefixes = val_files
+    else:
+        raise NotImplementedError
+
+    img_infos = []
+    for i, prefix in enumerate(img_prefixes):
+        if i > 0 and i % print_every == 0:
+            print(f'{i}/{len(img_prefixes)}')
+        img_file = osp.join(root_path, 'imgs', prefix + '.jpg')
+        # Skip missing images
+        if not osp.exists(img_file):
+            continue
+        img = mmcv.imread(img_file)
+
+        img_info = dict(
+            file_name=osp.basename(img_file),
+            height=img.shape[0],
+            width=img.shape[1],
+            segm_file=osp.basename(annotation_path))
+
+        anno_info = []
+        for ann in annotation[prefix]:
+            segmentation = []
+            for x, y in ann['points']:
+                segmentation.append(max(0, x))
+                segmentation.append(max(0, y))
+            xs, ys = segmentation[::2], segmentation[1::2]
+            x, y = min(xs), min(ys)
+            w, h = max(xs) - x, max(ys) - y
+            bbox = [x, y, w, h]
+            if ann['transcription'] == '###' or ann['illegibility']:
+                iscrowd = 1
+            else:
+                iscrowd = 0
+            anno = dict(
+                iscrowd=iscrowd,
+                category_id=1,
+                bbox=bbox,
+                area=w * h,
+                segmentation=[segmentation])
+            anno_info.append(anno)
+        img_info.update(anno_info=anno_info)
+        img_infos.append(img_info)
+
+    return img_infos
+
+
+def main():
+    args = parse_args()
+    root_path = args.root_path
+    print('Processing training set...')
+    training_infos = collect_art_info(root_path, 'train', args.val_ratio)
+    convert_annotations(training_infos,
+                        osp.join(root_path, 'instances_training.json'))
+    if args.val_ratio > 0:
+        print('Processing validation set...')
+        val_infos = collect_art_info(root_path, 'val', args.val_ratio)
+        convert_annotations(val_infos, osp.join(root_path,
+                                                'instances_val.json'))
+    print('Finished')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/data/textrecog/art_converter.py b/tools/data/textrecog/art_converter.py
new file mode 100644
index 000000000..8e47aca5e
--- /dev/null
+++ b/tools/data/textrecog/art_converter.py
@@ -0,0 +1,129 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+import math
+import os.path as osp
+
+import mmcv
+
+from mmocr.utils.fileio import list_to_file
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Generate training and validation set of ArT')
+    parser.add_argument('root_path', help='Root dir path of ArT')
+    parser.add_argument(
+        '--val-ratio', help='Split ratio for val set', default=0.0, type=float)
+    parser.add_argument(
+        '--format',
+        default='jsonl',
+        help='Use jsonl or txt to format annotations',
+        choices=['jsonl', 'txt'])
+    args = parser.parse_args()
+    return args
+
+
+def convert_art(root_path, split, ratio, format):
+    """Collect the annotation information and generate the label file.
+
+    The annotation format is as follows:
+    {
+        "gt_2836_0": [
+            {
+                "transcription": "URDER",
+                "points": [
+                    [25, 51],
+                    [0, 2],
+                    [21, 0],
+                    [42, 43]
+                ],
+                "language": "Latin",
+                "illegibility": false
+            }
+        ], ...
+    }
+
+    Args:
+        root_path (str): The root path of the dataset
+        split (str): The split of the dataset, which should be 'train' or
+            'val'
+        ratio (float): Split ratio for val set
+        format (str): Annotation format, which should be 'txt' or 'jsonl'
+    """
+
+    annotation_path = osp.join(root_path,
+                               'annotations/train_task2_labels.json')
+    if not osp.exists(annotation_path):
+        raise FileNotFoundError(
+            f'{annotation_path} does not exist, please check and try again.')
+
+    annotation = mmcv.load(annotation_path)
+    # outputs
+    dst_label_file = osp.join(root_path, f'{split}_label.{format}')
+
+    img_prefixes = annotation.keys()
+
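+    # Deterministic interleaved split: with ratio 0.2, math.floor(1 / ratio)
+    # is 5, so every 5th prefix (i % 5 == 0, i.e. indices 0, 5, 10, ...) is
+    # assigned to the val set (~20%) and the rest to the training set. Note
+    # that a ratio above 0.5 makes math.floor(1 / ratio) == 1, which would
+    # route every sample to val, so small ratios are assumed here.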
+    trn_files, val_files = [], []
+    if ratio > 0:
+        for i, file in enumerate(img_prefixes):
+            if i % math.floor(1 / ratio):
+                trn_files.append(file)
+            else:
+                val_files.append(file)
+    else:
+        trn_files, val_files = img_prefixes, []
+    print(f'training #{len(trn_files)}, val #{len(val_files)}')
+
+    if split == 'train':
+        img_prefixes = trn_files
+    elif split == 'val':
+        img_prefixes = val_files
+    else:
+        raise NotImplementedError
+
+    labels = []
+    for prefix in img_prefixes:
+        text_label = annotation[prefix][0]['transcription']
+        dst_img_name = prefix + '.jpg'
+
+        if format == 'txt':
+            labels.append(f'crops/{dst_img_name} {text_label}')
+        elif format == 'jsonl':
+            labels.append(
+                json.dumps(
+                    {
+                        'filename': f'crops/{dst_img_name}',
+                        'text': text_label
+                    },
+                    ensure_ascii=False))
+
+    list_to_file(dst_label_file, labels)
+
+
+def main():
+    args = parse_args()
+    root_path = args.root_path
+    print('Processing training set...')
+    convert_art(
+        root_path=root_path,
+        split='train',
+        ratio=args.val_ratio,
+        format=args.format)
+    if args.val_ratio > 0:
+        print('Processing validation set...')
+        convert_art(
+            root_path=root_path,
+            split='val',
+            ratio=args.val_ratio,
+            format=args.format)
+    print('Finished')
+
+
+if __name__ == '__main__':
+    main()
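+
+# Illustrative usage (paths are placeholders):
+#   python tools/data/textrecog/art_converter.py PATH/TO/art --val-ratio 0.2
+# writes train_label.jsonl and val_label.jsonl under PATH/TO/art; passing
+# --format txt instead writes whitespace-separated `crops/<name>.jpg <text>`
+# lines.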