#!/usr/bin/env python
# -*- coding: utf-8 -*-
# bert_embedding.py
# Author: kun
# @Time: 2019-10-30 15:54

import torch
import torch.nn as nn

from core import text
from pytorch_pretrained_bert import BertForMaskedLM
from pytorch_pretrained_bert.modeling import BertOnlyMLMHead


class BertLikeSentencePieceTextEncoder(object):
    """Wrap a `core.text.SubwordTextEncoder` and extend its vocabulary
    with the BERT special tokens [CLS], [SEP] and [MASK]."""

    def __init__(self, text_encoder):
        if not isinstance(text_encoder, text.SubwordTextEncoder):
            raise TypeError(
                "`text_encoder` must be an instance of `core.text.SubwordTextEncoder`.")
        self.text_encoder = text_encoder

    @property
    def vocab_size(self):
        # +3 accounts for [CLS], [SEP] and [MASK]
        return self.text_encoder.vocab_size + 3

    @property
    def cls_idx(self):
        return self.vocab_size - 3

    @property
    def sep_idx(self):
        return self.vocab_size - 2

    @property
    def mask_idx(self):
        return self.vocab_size - 1

    @property
    def eos_idx(self):
        return self.text_encoder.eos_idx
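

# A minimal sketch of how the extended ids line up, assuming a hypothetical
# base encoder with vocab_size == 1000 (any `core.text.SubwordTextEncoder`
# would behave the same way, with its own size):
#
#   wrapper = BertLikeSentencePieceTextEncoder(subword_encoder)
#   wrapper.vocab_size  # 1003: 1000 subwords + [CLS], [SEP], [MASK]
#   wrapper.cls_idx     # 1000
#   wrapper.sep_idx     # 1001
#   wrapper.mask_idx    # 1002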


def generate_embedding(bert_model, labels):
    """Generate BERT embeddings for `labels` with a fine-tuned model.

    `labels` is a (batch, time) tensor of token ids; the result is a
    (batch, time, hidden_size) tensor aligned with `labels`.
    """
    batch_size, _ = labels.shape
    # prepend [CLS] to every sequence in the batch
    cls_ids = torch.full(
        (batch_size, 1), bert_model.bert_text_encoder.cls_idx,
        dtype=labels.dtype, device=labels.device)
    bert_labels = torch.cat([cls_ids, labels], 1)
    # replace the encoder's <eos> id with BERT's [SEP] id
    eos_idx = bert_model.bert_text_encoder.eos_idx
    sep_idx = bert_model.bert_text_encoder.sep_idx
    bert_labels[bert_labels == eos_idx] = sep_idx
    embedding, _ = bert_model.bert(bert_labels, output_all_encoded_layers=True)
    # sum the hidden states over all encoder layers
    embedding = torch.stack(embedding).sum(0)
    # drop the [CLS] position so the output aligns with `labels`
    embedding = embedding[:, 1:]
    assert labels.shape == embedding.shape[:-1]
    return embedding


def load_fine_tuned_model(bert_model, text_encoder, path):
    """Load a fine-tuned BERT model given a text encoder and a checkpoint path."""
    bert_text_encoder = BertLikeSentencePieceTextEncoder(text_encoder)
    model = BertForMaskedLM.from_pretrained(bert_model)
    model.bert_text_encoder = bert_text_encoder
    # resize the input embedding table to the extended vocabulary
    model.bert.embeddings.word_embeddings = nn.Embedding(
        bert_text_encoder.vocab_size, model.bert.embeddings.word_embeddings.weight.shape[1])
    model.config.vocab_size = bert_text_encoder.vocab_size
    # rebuild the MLM head so its output layer is tied to the new embedding matrix
    model.cls = BertOnlyMLMHead(
        model.config, model.bert.embeddings.word_embeddings.weight)
    model.load_state_dict(torch.load(path))
    return model
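

# A hedged loading note: if the checkpoint was saved on GPU but is loaded on a
# CPU-only machine, the same call with an explicit map_location avoids device
# errors, e.g. torch.load(path, map_location="cpu").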


class BertEmbeddingPredictor(nn.Module):
    def __init__(self, bert_model, text_encoder, path):
        super(BertEmbeddingPredictor, self).__init__()
        self.model = load_fine_tuned_model(bert_model, text_encoder, path)

    def forward(self, labels):
        # do not modify this: the predictor must stay in eval mode so dropout
        # is disabled and the returned embeddings are deterministic
        self.eval()
        return generate_embedding(self.model, labels)
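

# A minimal usage sketch, assuming a hypothetical SentencePiece vocabulary file
# and checkpoint path; the `SubwordTextEncoder` constructor call below is an
# assumption about `core.text` and may differ in the actual codebase.
if __name__ == "__main__":
    subword_encoder = text.SubwordTextEncoder("subword.model")  # hypothetical vocab file
    predictor = BertEmbeddingPredictor(
        "bert-base-uncased", subword_encoder, "fine_tuned_bert.pth")  # hypothetical checkpoint
    fake_labels = torch.randint(0, subword_encoder.vocab_size, (2, 10))  # (batch, time) ids
    with torch.no_grad():
        embedding = predictor(fake_labels)
    print(embedding.shape)  # torch.Size([2, 10, 768]) for a bert-base model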