Refactored SentencePieceTokenizer
m.habedank committed Oct 26, 2024
1 parent c27d123 commit 8e90e70
Showing 2 changed files with 16 additions and 6 deletions.
ludwig/utils/tokenizers.py (8 changes: 3 additions & 5 deletions)

@@ -1028,16 +1028,14 @@ def convert_token_to_id(self, token: str) -> int:
 
 
 class SentencePieceTokenizer(torch.nn.Module):
-    def __init__(self, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
+    def __init__(self, **kwargs):
         super().__init__()
-        if pretrained_model_name_or_path is None:
-            pretrained_model_name_or_path = "https://download.pytorch.org/models/text/xlmr.sentencepiece.bpe.model"
-        self.tokenizer = torchtext.transforms.SentencePieceTokenizer(sp_model_path=pretrained_model_name_or_path)
+        self.tokenizer = load_pretrained_hf_tokenizer("FacebookAI/xlm-roberta-base")
 
     def forward(self, v: Union[str, List[str], torch.Tensor]):
         if isinstance(v, torch.Tensor):
            raise ValueError(f"Unsupported input: {v}")
-        return self.tokenizer(v)
+        return self.tokenizer.tokenize(v)
 
 
 class _BPETokenizer(torch.nn.Module):
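For orientation, here is a minimal sketch of what the refactored tokenizer now delegates to. It assumes load_pretrained_hf_tokenizer is a thin wrapper around transformers.AutoTokenizer.from_pretrained (the helper itself is not shown in this diff), so this is an approximation of the behavior rather than Ludwig's exact code path:

# Sketch only: assumes load_pretrained_hf_tokenizer wraps AutoTokenizer.from_pretrained.
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
tokens = hf_tokenizer.tokenize("This is a sentence. And this is another one.")
# Per the new unit test below, this yields SentencePiece tokens such as
# ["▁This", "▁is", "▁a", "▁sentence", ".", "▁And", "▁this", "▁is", "▁another", "▁one", "."]

Delegating to the Hugging Face checkpoint means the class no longer downloads the hard-coded torchtext SentencePiece model file or constructs a torchtext.transforms.SentencePieceTokenizer.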
tests/ludwig/utils/test_tokenizers.py (14 changes: 13 additions & 1 deletion)

@@ -4,7 +4,12 @@
 import torch
 import torchtext
 
-from ludwig.utils.tokenizers import EnglishLemmatizeFilterTokenizer, NgramTokenizer, StringSplitTokenizer
+from ludwig.utils.tokenizers import (
+    EnglishLemmatizeFilterTokenizer,
+    NgramTokenizer,
+    SentencePieceTokenizer,
+    StringSplitTokenizer,
+)
 
 TORCHTEXT_0_14_0_HF_NAMES = [
     "bert-base-uncased",

@@ -85,3 +90,10 @@ def test_english_lemmatize_filter_tokenizer():
     tokenizer = EnglishLemmatizeFilterTokenizer()
     tokens = tokenizer(inputs)
     assert len(tokens) > 0
+
+
+def test_sentence_piece_tokenizer():
+    inputs = "This is a sentence. And this is another one."
+    tokenizer = SentencePieceTokenizer()
+    tokens = tokenizer(inputs)
+    assert tokens == ["▁This", "▁is", "▁a", "▁sentence", ".", "▁And", "▁this", "▁is", "▁another", "▁one", "."]
