forked from IndoNLP/nusa-crowd
-
Notifications
You must be signed in to change notification settings - Fork 0
/
filtering_test.py
51 lines (43 loc) · 1.63 KB
/
filtering_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from nusacrowd import NusantaraMetadata, NusantaraConfigHelper
from nusacrowd.utils.constants import Tasks
if __name__ == "__main__":
    # Example script: build the config-helper collection and query it with
    # several predicates, printing each result.
    conhelps = NusantaraConfigHelper()
    print('All Configs')
    print(conhelps)

    # filter and load datasets
    # ====================================================================
    print('Retrieve SMSA')
    # Run the SMSA query once and materialise it, so the helpers that are
    # printed are exactly the ones that get loaded below (the predicate is
    # not re-evaluated over the whole collection a second time).
    smsa_helpers = list(
        conhelps.filtered(
            lambda x: ("smsa" in x.dataset_name and x.is_nusantara_schema)
        )
    )
    print(smsa_helpers)
    smsa_datasets = [helper.load_dataset() for helper in smsa_helpers]
    print(smsa_datasets)

    # examples of other filters
    # ====================================================================

    # get all source schema config helpers
    print('Source datasets')
    source_helpers = conhelps.filtered(lambda x: x.config.schema == "source")
    print(source_helpers)

    # get all nusantara config helpers
    print('Nusantara datasets')
    nusantara_helpers = conhelps.filtered(lambda x: x.is_nusantara_schema)
    print(nusantara_helpers)

    # nusantara NER public tasks: nusantara schema, tagged with the NER task,
    # and not flagged as local
    print('Nusantara NER public datasets')
    nc_ner_public_helpers = conhelps.filtered(
        lambda x: (
            x.is_nusantara_schema
            and Tasks.NAMED_ENTITY_RECOGNITION in x.tasks
            and not x.is_local
        )
    )
    print(nc_ner_public_helpers)

    # indolem datasets
    print('IndoLEM datasets')
    nc_indolem_helpers = conhelps.filtered(
        lambda x: ("indolem" in x.dataset_name and x.is_nusantara_schema)
    )
    print(nc_indolem_helpers)