forked from EleutherAI/gpt-neox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
include_dataset.py
95 lines (86 loc) · 1.93 KB
/
include_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from datasets import get_dataset_config_names, load_dataset, load_from_disk
import re
from p3_transform import tokenize
# Dataset-name patterns to keep, grouped by task family.
# Each list entry is a pattern that is searched for (via `re.search`) inside a
# candidate path; the group names themselves are not used for matching.
DATASETS = {
    'multiple_choice': [
        'common_sense',
        'dream',
        'quail',
        'quartz',
        'social',
        'wiqa',
        'cosmos',
        'qasc',
        'quarel',
        'sciq',
        'wiki_hop',
        'arc',
        'openbookqa',
        'multirc',
        'piqa',
        'race_high',
        # 'hellaswag',
        'boolq',
    ],
    'extractive': [
        'adversarial',
        'quoref',
        'duorc',
        'ropes',
        'squad',
        'record',
    ],
    'close': [
        'hotpot',
        'wiki_qa',
        'trivia_qa',
        'web_questions',
    ],
    'sentiment': [
        'amazon',
        'app_reviews',
        'imdb',
        'rotten_tomatoes',
        'yelp',
    ],
    'summarization': [
        'cnn_dailymail',
        'gigaword',
        'multi_news',
        'samsum',
        'xsum',
    ],
    'topic_classification': [
        'ag_news',
        'dbpedia',
        'trec',
    ],
    # NOTE(review): key is spelled 'paraphase' (sic); left unchanged in case
    # other code looks the group up by this exact name.
    'paraphase': [
        'mrpc',
        'paws',
        'qqp',
    ],
    'structure_to_text': [
        'common_gen',
        'wiki_bio',
    ],
}
def to_include(config: str, datasets=None) -> bool:
    '''Check whether a path belongs in the train/test/validation dataset.

    A path is accepted as soon as any pattern from any group matches
    somewhere inside it. Patterns are interpreted as regular expressions
    and matched with `re.search`, so a match anywhere in the string counts.

    Args:
        config: The path (or config name) to test.
        datasets: Optional mapping of group name -> iterable of patterns.
            When omitted or None, the module-level `DATASETS` table is used.

    Returns:
        True if at least one pattern matches `config`, otherwise False.
    '''
    # Resolve the default at call time so callers can supply their own table
    # without touching the module-level one.
    table = DATASETS if datasets is None else datasets
    return any(
        re.search(pattern, config)
        for section in table.values()
        for pattern in section
    )
if __name__ == '__main__':
    # Phase 1: read every '<split>_paths.txt' listing (one entry per line)
    # up front, so a missing or unreadable listing raises before any
    # tokenization starts.
    files = {}
    for key in ['train', 'test', 'validation']:
        # The context manager closes the handle as soon as it has been read,
        # instead of leaving it open until garbage collection.
        with open(f'{key}_paths.txt', 'r') as paths_file:
            files[key] = paths_file.read().splitlines()

    # Phase 2: keep only the entries accepted by to_include().
    processed_files = {
        key: [config for config in configs if to_include(config)]
        for key, configs in files.items()
    }

    # Phase 3: hand each split's surviving entries to the tokenizer.
    for key, configs in processed_files.items():
        tokenize(configs, key)