-
Notifications
You must be signed in to change notification settings - Fork 3
/
eval_only.py
72 lines (58 loc) · 1.95 KB
/
eval_only.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""Eval model from huggingface on gold."""
import os
import logging
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from mbdataset import MLDatasetWithFloats
from processors import MultiLabelTSVProcessor
from mutils import write_metrics, perform_inference
# --- Run configuration ------------------------------------------------------
# Model name or path handed to the `from_pretrained` loaders below.
model_dir = "lrei/roberta-base-emolit"
print(f"Model = {model_dir}")

# Directory that receives the results file.
# NOTE(review): this is an absolute path at the filesystem root, whereas
# gold_file is relative to the working directory -- confirm this is intended.
output_dir = "/data"
# Created once here; exist_ok=True makes the call a no-op when the directory
# already exists, so a second call later in the script is unnecessary.
os.makedirs(output_dir, exist_ok=True)

# TSV file with the gold annotations the model is evaluated against.
gold_file = "./data/emolit/gold.tsv"
# Passed to MLDatasetWithFloats as its third positional argument; presumably
# the maximum sequence length in tokens -- TODO confirm against mbdataset.
SEQLEN = 40
# Passed as `device=` to `model.to`; presumably CUDA device index 0 -- confirm.
DEVICE = 0

# --- Logging ----------------------------------------------------------------
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
# Load the classifier, move it to the target device in half precision, and
# switch it to inference mode.
model = AutoModelForSequenceClassification.from_pretrained(model_dir).to(
    dtype=torch.float16, device=DEVICE
)
model.eval()

# Report the in-memory footprint, measured after the cast above:
# bytes = number of elements * bytes per element, summed over all tensors.
param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))
# This second line prints the parameter bytes only (buffers excluded).
print('model size: {}'.format(param_size))

# The tokenizer is loaded from the same identifier as the model.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
# Label mappings come from the model configuration.
config = model.config
id2label = config.id2label
print(id2label)
label2id = config.label2id
if label2id is None:
    # No reverse map stored in the config: derive it by inverting id2label.
    label2id = {name: idx for idx, name in id2label.items()}  # type: ignore

# Build the gold dataset with a label encoder derived from the model's labels.
processor_tst = MultiLabelTSVProcessor(data_file=gold_file)
le = MLDatasetWithFloats.create_label_encoder_from_id2label(id2label=id2label)
target_names = le.classes_.tolist()
tst_dataset = MLDatasetWithFloats(processor_tst, tokenizer, SEQLEN, le=le)

# Run the model over the dataset, then fetch the dataset's label ids.
# The order is kept as-is because both helpers are defined outside this file.
preds = perform_inference(model, tst_dataset)
labels = tst_dataset.get_label_ids()
# Write the metrics for the gold-set predictions to results.txt in output_dir.
results_file = os.path.join(output_dir, "results.txt")
metric_options = {
    "target_names": target_names,
    "output_file": results_file,
    "expand_neutral": None,
}
write_metrics(labels, preds, **metric_options)