import pandas as pd
from transformers import (RobertaTokenizerFast, RobertaForSequenceClassification,
                          Trainer, TrainingArguments, AutoTokenizer,
                          DebertaForSequenceClassification)
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Read the data
training_set = pd.read_json('./data/train_set.json')
test_set = pd.read_json('./data/test_set.json')
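
# Assumed schema (not verified here): each JSON file holds records with a 'text'
# field (str) and a 'label' field (0 or 1), e.g.
#   [{"text": "...", "label": 0}, {"text": "...", "label": 1}]
# Note: test_set is loaded but never used below; evaluation uses a held-out
# slice of training_set instead.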
max_length = 256
# Load the model and tokenizer; max_length caps the tokenized sequence length
# (the RoBERTa lines are kept for reference, but this script fine-tunes DeBERTa)
#model = RobertaForSequenceClassification.from_pretrained('roberta-base')
#tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = max_length)
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2)
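
# Optional sanity check (a minimal sketch, not in the original script): verify
# the tokenizer produces the tensors the model expects before training.
# encoded = tokenizer("a quick smoke test", truncation=True, padding=True, max_length=max_length)
# print(encoded.keys())  # typically input_ids and attention_mask (plus token_type_ids for DeBERTa)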
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
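
# The Trainer calls compute_metrics with an EvalPrediction holding
# `predictions` (logits) and `label_ids`; the returned keys are logged
# with an "eval_" prefix, e.g. eval_accuracy and eval_f1.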
# Define the training arguments
training_args = TrainingArguments(
    output_dir='results_deberta',
    num_train_epochs=100,
    per_device_train_batch_size=24,  # 32
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    evaluation_strategy="steps",
    disable_tqdm=False,
    load_best_model_at_end=True,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_steps=1000,
    fp16=True,
    logging_dir='logs',
    save_steps=1000,
    dataloader_num_workers=8,
)
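
# With these settings the effective training batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 24 * 4 = 96
# per optimizer step. Since load_best_model_at_end=True, the eval and save
# cadences must line up: eval_steps defaults to logging_steps (1000) here,
# matching save_steps (1000).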
class NewsGroupsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        # a scalar label tensor is what the sequence-classification loss expects
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
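
# Illustrative usage (hypothetical inputs, not part of the training flow):
# ds = NewsGroupsDataset(tokenizer(["a short example"], truncation=True, padding=True), [0])
# ds[0]  # -> dict with 'input_ids', 'attention_mask', ... and a scalar 'labels' tensor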
if __name__ == '__main__':
    # Hold out the first 100 examples of the training set for evaluation
    train_test_split = 100
    train_encodings = tokenizer(training_set['text'].to_list()[train_test_split:], truncation=True,
                                padding=True, max_length=max_length)
    test_encodings = tokenizer(training_set['text'].to_list()[0:train_test_split], truncation=True,
                               padding=True, max_length=max_length)
    train_y = training_set['label'].to_list()[train_test_split:]
    test_y = training_set['label'].to_list()[0:train_test_split]
    # Convert the tokenized data into torch Datasets
    train_dataset = NewsGroupsDataset(train_encodings, train_y)
    test_dataset = NewsGroupsDataset(test_encodings, test_y)
    # Instantiate the Trainer; it moves the model to the available device
    # (GPU/CPU) itself, so no manual .to(device) call is needed
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )
    # Train the model
    trainer.train()
    # Evaluate the fine-tuned model after training
    trainer.evaluate()
    # Save the fine-tuned model & tokenizer
    model_path = "./model_deberta"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
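
    # To reuse the fine-tuned model later, a minimal reload sketch (assumes the
    # ./model_deberta directory produced above; not part of the original script):
    # tokenizer = AutoTokenizer.from_pretrained("./model_deberta")
    # model = DebertaForSequenceClassification.from_pretrained("./model_deberta")
    # inputs = tokenizer("some text to classify", return_tensors="pt",
    #                    truncation=True, max_length=max_length)
    # predicted_label = model(**inputs).logits.argmax(-1).item()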