# tokenizer.py

from typing import List, Optional, Tuple, Dict, Union, Any, overload, Sequence, NamedTuple
import collections
import os
import re
import unicodedata
import itertools
import requests
import copy
import json
from contextlib import contextmanager
from collections import OrderedDict, UserDict
from enum import Enum
import numpy as np
from utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available
from tokenizers import AddedToken
from tokenizers import Encoding as EncodingFast


# Sentinel lengths standing in for "no limit".
VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# File names probed in addition to the vocabulary files by ``PreTrainedTokenizerBase.from_pretrained``.
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
FULL_TOKENIZER_FILE = "tokenizer.json"

# Per-checkpoint tables for "bert-base-uncased".
# NOTE(review): nothing in the visible part of this file reads the four tables below; presumably a
# BERT tokenizer class defined elsewhere assigns them to its class attributes — confirm.
# Maps a constructor argument name to the vocabulary file name.
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
# Maps a constructor argument name to {checkpoint name: file URL}.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"
    }
}
# Maps a checkpoint name to an integer size (512 here).
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "bert-base-uncased": 512
}
# Maps a checkpoint name to default keyword arguments.
PRETRAINED_INIT_CONFIGURATION = {
    "bert-base-uncased": {"do_lower_case": True}
}


# Type aliases describing the shapes of tokenizer inputs.
TextInput = str  # a single raw string
PreTokenizedInput = List[str]  # a list of strings
EncodedInput = List[int]  # a list of integers
TextInputPair = Tuple[str, str]  # two raw strings
PreTokenizedInputPair = Tuple[List[str], List[str]]  # two lists of strings
EncodedInputPair = Tuple[List[int], List[int]]  # two lists of integers


class ExplicitEnum(Enum):
  """
  Enum base class whose failed value lookups report every accepted value.
  """

  @classmethod
  def _missing_(cls, value):
    # ``Enum`` calls this hook when ``value`` matches no member; answer with an error
    # that spells out the valid choices.
    valid_values = list(cls._value2member_map_.keys())
    raise ValueError(f"{value!r} is not a valid {cls.__name__}, please select one of {valid_values}")


class TruncationStrategy(ExplicitEnum):
  """
  String identifiers for the truncation strategies.

  Built on ``ExplicitEnum``, so looking up an unknown value raises a ``ValueError`` that lists the
  valid ones.
  """

  ONLY_FIRST = "only_first"
  ONLY_SECOND = "only_second"
  LONGEST_FIRST = "longest_first"
  DO_NOT_TRUNCATE = "do_not_truncate"


class PaddingStrategy(ExplicitEnum):
  """
  String identifiers for the padding strategies.

  Built on ``ExplicitEnum``, so looking up an unknown value raises a ``ValueError`` that lists the
  valid ones.
  """

  LONGEST = "longest"
  MAX_LENGTH = "max_length"
  DO_NOT_PAD = "do_not_pad"


class TensorType(ExplicitEnum):
  """
  String identifiers for the tensor frameworks accepted by ``BatchEncoding.convert_to_tensors``.

  Built on ``ExplicitEnum``, so looking up an unknown value raises a ``ValueError`` that lists the
  valid ones.
  """

  PYTORCH = "pt"
  TENSORFLOW = "tf"
  NUMPY = "np"
  JAX = "jax"


class CharSpan(NamedTuple):
  """
  ``(start, end)`` pair of integers.

  ``BatchEncoding.token_to_chars`` and ``BatchEncoding.word_to_chars`` wrap the backend result in it.
  NOTE(review): presumably character offsets into the original text with ``end`` exclusive — confirm
  against the backend documentation.
  """

  start: int
  end: int


class TokenSpan(NamedTuple):
  """
  ``(start, end)`` pair of integers.

  ``BatchEncoding.word_to_tokens`` wraps the backend result in it.
  NOTE(review): presumably token indices in the encoded sequence with ``end`` exclusive — confirm
  against the backend documentation.
  """

  start: int
  end: int


def to_py_obj(obj):
  """
  Recursively convert TensorFlow tensors, PyTorch tensors, Numpy arrays/scalars and python containers
  to plain python objects.

  Args:
    obj: The object to convert.

  Returns:
    - a ``dict`` for a ``dict`` or ``BatchEncoding`` (values converted recursively),
    - a ``list`` for a ``list`` or ``tuple`` (items converted recursively),
    - the result of ``tolist()`` for a framework tensor, a Numpy array or a Numpy scalar,
    - ``obj`` itself for anything else.
  """
  if isinstance(obj, (dict, BatchEncoding)):
    return {k: to_py_obj(v) for k, v in obj.items()}
  if isinstance(obj, (list, tuple)):
    return [to_py_obj(o) for o in obj]

  if is_tf_available():
    # Imported lazily so the module does not require TensorFlow to be installed.
    import tensorflow as tf

    if isinstance(obj, tf.Tensor):
      return obj.numpy().tolist()

  if is_torch_available() and _is_torch(obj):
    return obj.detach().cpu().tolist()

  # ``np.number`` covers Numpy scalars (e.g. ``np.int64``), whose ``tolist()`` yields a python scalar.
  if isinstance(obj, (np.ndarray, np.number)):
    return obj.tolist()

  return obj


def _is_torch(x):
  """Return ``True`` when ``x`` is a ``torch.Tensor``; PyTorch is imported on call."""
  from torch import Tensor

  return isinstance(x, Tensor)


def _is_torch_device(x):
  """Return ``True`` when ``x`` is a ``torch.device``; PyTorch is imported on call."""
  from torch import device as torch_device

  return isinstance(x, torch_device)


def _is_end_of_word(text):
  """Return ``True`` when the last character of ``text`` is a control, punctuation or whitespace character."""
  tail = text[-1]
  return any(check(tail) for check in (_is_control, _is_punctuation, _is_whitespace))


def _is_start_of_word(text):
  """Return ``True`` when the first character of ``text`` is a control, punctuation or whitespace character."""
  head = text[0]
  return any(check(head) for check in (_is_control, _is_punctuation, _is_whitespace))


def _is_punctuation(char):
  """
  Return ``True`` when the single character ``char`` counts as punctuation.

  Every non-letter/number printable ASCII character qualifies (including "^", "$" and "`", which
  are outside the Unicode punctuation class, for consistency), as does any character whose Unicode
  category starts with "P".
  """
  code_point = ord(char)
  # ASCII symbol ranges: "!".."/", ":".."@", "[".."`", "{".."~".
  if 33 <= code_point <= 47 or 58 <= code_point <= 64 or 91 <= code_point <= 96 or 123 <= code_point <= 126:
    return True
  return unicodedata.category(char).startswith("P")


def _is_whitespace(char):
  """
  Return ``True`` when the single character ``char`` counts as whitespace.

  Space, tab, newline and carriage return qualify (the last three are technically control
  characters but are treated as whitespace), as does any character in Unicode category "Zs".
  """
  if char in (" ", "\t", "\n", "\r"):
    return True
  return unicodedata.category(char) == "Zs"


def _is_control(char):
  """
  Return ``True`` when the single character ``char`` counts as a control character.

  Tab, newline and carriage return are excluded (they are counted as whitespace); otherwise any
  character whose Unicode category starts with "C" qualifies.
  """
  if char in ("\t", "\n", "\r"):
    return False
  return unicodedata.category(char).startswith("C")


def load_vocab(vocab_file):
  """
  Read a vocabulary file (one token per line, UTF-8) into an ordered token -> line index mapping.

  Only the trailing newline of each line is stripped. A token appearing on several lines keeps its
  first position in the mapping and the index of its last occurrence.
  """
  with open(vocab_file, "r", encoding="utf-8") as reader:
    lines = reader.readlines()
  return collections.OrderedDict((line.rstrip("\n"), position) for position, line in enumerate(lines))


def whitespace_tokenize(text):
  """Strip ``text`` and split it on runs of whitespace; blank input gives an empty list."""
  stripped = text.strip()
  return stripped.split() if stripped else []


class BatchEncoding(UserDict):
  """
  Dictionary-like container for tokenizer outputs, optionally backed by fast-tokenizer encodings.

  String keys index the underlying ``data`` dict (its entries are also reachable as attributes);
  integer keys index the backend encodings when they are present.

  Args:
    data: Mapping stored as the dictionary content.
    encoding: A single backend encoding or a sequence of them (one per batch entry). When given,
      the encoding-based helper methods become usable.
    tensor_type: If set, the values of ``data`` are converted to tensors of that framework at
      construction time (see :meth:`convert_to_tensors`).
    prepend_batch_axis: Whether each value is wrapped in a list before the tensor conversion.
    n_sequences: Value exposed by :attr:`n_sequences`; when omitted it is read from the first
      encoding, if any.
  """

  def __init__(
    self,
    data: Optional[Dict[str, Any]] = None,
    encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None,
    tensor_type: Union[None, str, TensorType] = None,
    prepend_batch_axis: bool = False,
    n_sequences: Optional[int] = None,
  ):
    super().__init__(data)

    if isinstance(encoding, EncodingFast):
      encoding = [encoding]

    self._encodings = encoding

    if n_sequences is None and encoding is not None and len(encoding):
      n_sequences = encoding[0].n_sequences

    self._n_sequences = n_sequences

    self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

  @property
  def n_sequences(self) -> Optional[int]:
    """Value given at construction, or the ``n_sequences`` of the first encoding; ``None`` if unknown."""
    return self._n_sequences

  @property
  def is_fast(self) -> bool:
    """Whether backend encodings are attached to this object."""
    return self._encodings is not None

  def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
    """
    Return ``data[item]`` for a string key, or the backend encoding at position ``item`` otherwise.

    Raises:
      KeyError: For a non-string key when no backend encodings are attached.
    """
    if isinstance(item, str):
      return self.data[item]
    elif self._encodings is not None:
      return self._encodings[item]
    else:
      raise KeyError(
        "Indexing with integers (to access backend Encoding for a given batch index) "
        "is not available when using Python based tokenizers"
      )

  def __getattr__(self, item: str):
    """Expose the entries of ``data`` as attributes (only reached when normal lookup fails)."""
    # ``data`` is read through ``__dict__`` so that an instance on which it is not set yet
    # (e.g. one created with ``__new__``) cannot recurse back into ``__getattr__``.
    try:
      return self.__dict__["data"][item]
    except KeyError:
      raise AttributeError(item)

  def __getstate__(self):
    """Return the picklable state: the data, the backend encodings and ``n_sequences``."""
    return {"data": self.data, "encodings": self._encodings, "n_sequences": self._n_sequences}

  def __setstate__(self, state):
    """Restore the attributes saved by :meth:`__getstate__`."""
    if "data" in state:
      self.data = state["data"]

    if "encodings" in state:
      self._encodings = state["encodings"]

    # States that do not carry ``n_sequences`` fall back to ``None``.
    self._n_sequences = state.get("n_sequences")

  def keys(self):
    return self.data.keys()

  def values(self):
    return self.data.values()

  def items(self):
    return self.data.items()

  # After this point:
  # Extended properties and methods only available for fast (Rust-based) tokenizers
  # provided by HuggingFace tokenizers library.

  @staticmethod
  def _split_index_args(batch_or_item_index: int, item_index: Optional[int]) -> Tuple[int, int]:
    """Map the ``(batch_or_item_index, item_index)`` calling convention to ``(batch_index, item_index)``."""
    if item_index is not None:
      return batch_or_item_index, item_index
    # A single argument is an item index inside the first batch entry.
    return 0, batch_or_item_index

  def _wrap_batch_index(self, batch_index: int) -> int:
    """Resolve a negative batch index relative to the number of attached encodings."""
    if batch_index < 0:
      return len(self._encodings) + batch_index
    return batch_index

  def _wrap_token_index(self, batch_index: int, token_index: int) -> int:
    """Resolve a negative token index relative to the number of ids of the selected encoding."""
    if token_index < 0:
      return len(self._encodings[batch_index].ids) + token_index
    return token_index

  def _wrap_word_index(self, batch_index: int, word_index: int, sequence_index: int) -> int:
    """Resolve a negative word index relative to the number of words of the selected sequence."""
    if word_index >= 0:
      return word_index
    encoding = self._encodings[batch_index]
    word_ids = [
      word_id
      for word_id, seq_id in zip(encoding.word_ids, encoding.sequence_ids)
      if word_id is not None and seq_id == sequence_index
    ]
    # NOTE(review): assumes the word ids of a sequence run from 0 up to their maximum — confirm
    # against the backend.
    n_words = max(word_ids) + 1 if word_ids else 0
    return n_words + word_index

  @property
  def encodings(self) -> Optional[List[EncodingFast]]:
    """The attached backend encodings, or ``None``."""
    return self._encodings

  def tokens(self, batch_index: int = 0) -> List[str]:
    """Return the ``tokens`` of the encoding at ``batch_index``; raises ``ValueError`` without encodings."""
    if not self._encodings:
      raise ValueError("tokens() is not available when using Python-based tokenizers")
    return self._encodings[batch_index].tokens

  def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]:
    """Return the ``sequence_ids`` of the encoding at ``batch_index``; raises ``ValueError`` without encodings."""
    if not self._encodings:
      raise ValueError("sequence_ids() is not available when using Python-based tokenizers")
    return self._encodings[batch_index].sequence_ids

  def words(self, batch_index: int = 0) -> List[Optional[int]]:
    """Same result as :meth:`word_ids`; raises ``ValueError`` without encodings."""
    if not self._encodings:
      raise ValueError("words() is not available when using Python-based tokenizers")
    return self.word_ids(batch_index)

  def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
    """Return the ``word_ids`` of the encoding at ``batch_index``; raises ``ValueError`` without encodings."""
    if not self._encodings:
      raise ValueError("word_ids() is not available when using Python-based tokenizers")
    return self._encodings[batch_index].word_ids

  def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
    """
    Delegate to ``token_to_sequence`` of the selected backend encoding.

    Call as ``token_to_sequence(token_index)`` (first batch entry) or
    ``token_to_sequence(batch_index, token_index)``. Negative indices count from the end.

    Raises:
      ValueError: When no backend encodings are attached.
    """
    if not self._encodings:
      raise ValueError("token_to_sequence() is not available when using Python based tokenizers")
    batch_index, token_index = self._split_index_args(batch_or_token_index, token_index)
    batch_index = self._wrap_batch_index(batch_index)
    token_index = self._wrap_token_index(batch_index, token_index)
    return self._encodings[batch_index].token_to_sequence(token_index)

  def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
    """
    Delegate to ``token_to_word`` of the selected backend encoding.

    Call as ``token_to_word(token_index)`` (first batch entry) or
    ``token_to_word(batch_index, token_index)``. Negative indices count from the end.

    Raises:
      ValueError: When no backend encodings are attached.
    """
    if not self._encodings:
      raise ValueError("token_to_word() is not available when using Python based tokenizers")
    batch_index, token_index = self._split_index_args(batch_or_token_index, token_index)
    batch_index = self._wrap_batch_index(batch_index)
    token_index = self._wrap_token_index(batch_index, token_index)
    return self._encodings[batch_index].token_to_word(token_index)

  def word_to_tokens(
    self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
  ) -> Optional[TokenSpan]:
    """
    Delegate to ``word_to_tokens`` of the selected backend encoding and wrap the result.

    Call as ``word_to_tokens(word_index)`` (first batch entry) or
    ``word_to_tokens(batch_index, word_index)``. Negative indices count from the end.

    Returns:
      A ``TokenSpan``, or ``None`` when the backend returns ``None``.

    Raises:
      ValueError: When no backend encodings are attached.
    """
    if not self._encodings:
      raise ValueError("word_to_tokens() is not available when using Python based tokenizers")
    batch_index, word_index = self._split_index_args(batch_or_word_index, word_index)
    batch_index = self._wrap_batch_index(batch_index)
    word_index = self._wrap_word_index(batch_index, word_index, sequence_index)
    span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)
    return TokenSpan(*span) if span is not None else None

  def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan:
    """
    Delegate to ``token_to_chars`` of the selected backend encoding and wrap the result in a ``CharSpan``.

    Call as ``token_to_chars(token_index)`` or ``token_to_chars(batch_index, token_index)``.

    Raises:
      ValueError: When no backend encodings are attached.
    """
    if not self._encodings:
      raise ValueError("token_to_chars() is not available when using Python based tokenizers")
    batch_index, token_index = self._split_index_args(batch_or_token_index, token_index)
    return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index)))

  def char_to_token(
    self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0
  ) -> int:
    """
    Delegate to ``char_to_token`` of the selected backend encoding.

    Call as ``char_to_token(char_index)`` or ``char_to_token(batch_index, char_index)``.

    Raises:
      ValueError: When no backend encodings are attached.
    """
    if not self._encodings:
      raise ValueError("char_to_token() is not available when using Python based tokenizers")
    batch_index, char_index = self._split_index_args(batch_or_char_index, char_index)
    return self._encodings[batch_index].char_to_token(char_index, sequence_index)

  def word_to_chars(
    self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
  ) -> CharSpan:
    """
    Delegate to ``word_to_chars`` of the selected backend encoding and wrap the result in a ``CharSpan``.

    Call as ``word_to_chars(word_index)`` or ``word_to_chars(batch_index, word_index)``.

    Raises:
      ValueError: When no backend encodings are attached.
    """
    if not self._encodings:
      raise ValueError("word_to_chars() is not available when using Python based tokenizers")
    batch_index, word_index = self._split_index_args(batch_or_word_index, word_index)
    return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))

  def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int:
    """
    Delegate to ``char_to_word`` of the selected backend encoding.

    Call as ``char_to_word(char_index)`` or ``char_to_word(batch_index, char_index)``.

    Raises:
      ValueError: When no backend encodings are attached.
    """
    if not self._encodings:
      raise ValueError("char_to_word() is not available when using Python based tokenizers")
    batch_index, char_index = self._split_index_args(batch_or_char_index, char_index)
    return self._encodings[batch_index].char_to_word(char_index, sequence_index)

  def convert_to_tensors(
    self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
  ):
    """
    Convert every value of ``data`` in place to a tensor of the requested framework.

    Args:
      tensor_type: A ``TensorType`` or its string value; ``None`` leaves the data untouched.
      prepend_batch_axis: Whether each value is wrapped in a list before the conversion.

    Returns:
      ``self``.

    Raises:
      ImportError: When the requested framework is not installed.
      ValueError: When a value cannot be turned into a tensor, or ``tensor_type`` is not a valid
        ``TensorType`` value.
    """
    if tensor_type is None:
      return self

    # Convert to TensorType
    if not isinstance(tensor_type, TensorType):
      tensor_type = TensorType(tensor_type)

    # Pick the constructor and the "already a tensor" predicate of the requested framework.
    if tensor_type == TensorType.TENSORFLOW:
      if not is_tf_available():
        raise ImportError(
          "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
        )
      import tensorflow as tf

      as_tensor = tf.constant
      is_tensor = tf.is_tensor
    elif tensor_type == TensorType.PYTORCH:
      if not is_torch_available():
        raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
      import torch

      as_tensor = torch.tensor
      is_tensor = torch.is_tensor
    elif tensor_type == TensorType.JAX:
      try:
        import jax.numpy as jnp  # noqa: F811
      except ImportError:
        raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")

      as_tensor = jnp.array

      def is_tensor(x):
        return isinstance(x, jnp.ndarray)

    else:
      as_tensor = np.asarray

      def is_tensor(x):
        return isinstance(x, np.ndarray)

    # Do the tensor conversion in batch. Only existing keys are reassigned, so the dict size does
    # not change while iterating.
    for key, value in self.items():
      try:
        if prepend_batch_axis:
          value = [value]

        if not is_tensor(value):
          self[key] = as_tensor(value)
      except Exception as err:
        if key == "overflowing_tokens":
          raise ValueError(
            "Unable to create tensor returning overflowing tokens of different lengths. "
            "Please see if a fast version of this tokenizer is available to have this feature available."
          ) from err
        raise ValueError(
          "Unable to create tensor, you should probably activate truncation and/or padding "
          "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
        ) from err

    return self

  def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
    """
    Call ``v.to(device=device)`` on every value of ``data`` and return ``self``.

    Anything that is not a string, an int or a ``torch.device`` is ignored and the data is left
    untouched.
    """
    # This check catches things like APEX blindly calling "to" on all inputs to a module
    # Otherwise it passes the casts down and casts the LongTensor containing the token idxs
    # into a HalfTensor
    # The cheap type checks run first so that torch is only imported when it is needed.
    if isinstance(device, (str, int)) or _is_torch_device(device):
      self.data = {k: v.to(device=device) for k, v in self.data.items()}
    return self


class SpecialTokensMixin:
  """
  Mixin that stores the special tokens of a tokenizer and exposes them as tokens and as ids.

  The host class is expected to provide ``convert_tokens_to_ids``, ``convert_ids_to_tokens`` and
  ``_add_tokens``; none of them is implemented here.

  Args:
    verbose: Stored as ``self.verbose``.
    **kwargs: Any of the names in ``SPECIAL_TOKENS_ATTRIBUTES``; ``None`` values and other names
      are ignored.
  """

  SPECIAL_TOKENS_ATTRIBUTES = [
    "bos_token",
    "eos_token",
    "unk_token",
    "sep_token",
    "pad_token",
    "cls_token",
    "mask_token",
    "additional_special_tokens",
  ]

  def __init__(self, verbose=True, **kwargs):
    self._bos_token = None
    self._eos_token = None
    self._unk_token = None
    self._sep_token = None
    self._pad_token = None
    self._cls_token = None
    self._mask_token = None
    self._pad_token_type_id = 0
    self._additional_special_tokens = []
    self.verbose = verbose

    # We directly set the hidden value to allow initialization with special tokens
    # which are not yet in the vocabulary. Necessary for serialization/de-serialization
    # TODO clean this up at some point (probably by switching to fast tokenizers)
    for key, value in kwargs.items():
      if value is None or key not in self.SPECIAL_TOKENS_ATTRIBUTES:
        continue
      if key == "additional_special_tokens":
        assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
        # Same element types as accepted by ``add_special_tokens``.
        assert all(
          isinstance(t, (str, AddedToken)) for t in value
        ), "One of the tokens is not a string or an AddedToken"
        setattr(self, key, value)
      elif isinstance(value, (str, AddedToken)):
        setattr(self, key, value)
      else:
        raise TypeError(
          "special token {} has to be either str or AddedToken but got: {}".format(key, type(value))
        )

  def sanitize_special_tokens(self) -> int:
    """Pass every special token currently set to :meth:`add_tokens` and return its result."""
    return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)

  def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int:
    """
    Set special token attributes and register the tokens through :meth:`add_tokens`.

    Args:
      special_tokens_dict: Maps names from ``SPECIAL_TOKENS_ATTRIBUTES`` to a token, or to a
        list/tuple of tokens for ``"additional_special_tokens"``.

    Returns:
      The sum of the values returned by :meth:`add_tokens`.

    Raises:
      AssertionError: For an unknown key or a value of the wrong type. Each entry is validated
        before it is assigned.
    """
    if not special_tokens_dict:
      return 0

    added_tokens = 0
    for key, value in special_tokens_dict.items():
      assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"

      if key == "additional_special_tokens":
        assert isinstance(value, (list, tuple)) and all(
          isinstance(t, (str, AddedToken)) for t in value
        ), f"Tokens {value} for key {key} should all be str or AddedToken instances"
        tokens_to_add = value
      else:
        assert isinstance(
          value, (str, AddedToken)
        ), f"Token {value} for key {key} should be a str or an AddedToken instance"
        tokens_to_add = [value]

      setattr(self, key, value)
      added_tokens += self.add_tokens(tokens_to_add, special_tokens=True)

    return added_tokens

  def add_tokens(
    self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False
  ) -> int:
    """
    Forward ``new_tokens`` (wrapped in a list when it is a single token) to :meth:`_add_tokens`.

    Returns:
      ``0`` for empty input, otherwise the value returned by :meth:`_add_tokens`.
    """
    if not new_tokens:
      return 0

    if not isinstance(new_tokens, (list, tuple)):
      new_tokens = [new_tokens]

    return self._add_tokens(new_tokens, special_tokens=special_tokens)

  def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
    """Hook to be provided by the host class; always raises ``NotImplementedError`` here."""
    raise NotImplementedError

  # Token accessors: each returns the stored token as a string, or ``None`` when it is not set.

  @property
  def bos_token(self) -> Optional[str]:
    if self._bos_token is None:
      return None
    return str(self._bos_token)

  @property
  def eos_token(self) -> Optional[str]:
    if self._eos_token is None:
      return None
    return str(self._eos_token)

  @property
  def unk_token(self) -> Optional[str]:
    if self._unk_token is None:
      return None
    return str(self._unk_token)

  @property
  def sep_token(self) -> Optional[str]:
    if self._sep_token is None:
      return None
    return str(self._sep_token)

  @property
  def pad_token(self) -> Optional[str]:
    if self._pad_token is None:
      return None
    return str(self._pad_token)

  @property
  def cls_token(self) -> Optional[str]:
    if self._cls_token is None:
      return None
    return str(self._cls_token)

  @property
  def mask_token(self) -> Optional[str]:
    if self._mask_token is None:
      return None
    return str(self._mask_token)

  @property
  def additional_special_tokens(self) -> Optional[List[str]]:
    if self._additional_special_tokens is None:
      return None
    return [str(tok) for tok in self._additional_special_tokens]

  @bos_token.setter
  def bos_token(self, value):
    self._bos_token = value

  @eos_token.setter
  def eos_token(self, value):
    self._eos_token = value

  @unk_token.setter
  def unk_token(self, value):
    self._unk_token = value

  @sep_token.setter
  def sep_token(self, value):
    self._sep_token = value

  @pad_token.setter
  def pad_token(self, value):
    self._pad_token = value

  @cls_token.setter
  def cls_token(self, value):
    self._cls_token = value

  @mask_token.setter
  def mask_token(self, value):
    self._mask_token = value

  @additional_special_tokens.setter
  def additional_special_tokens(self, value):
    self._additional_special_tokens = value

  # Id accessors: each converts the stored token with ``convert_tokens_to_ids``, or returns
  # ``None`` when the token is not set.

  @property
  def bos_token_id(self) -> Optional[int]:
    if self._bos_token is None:
      return None
    return self.convert_tokens_to_ids(self.bos_token)

  @property
  def eos_token_id(self) -> Optional[int]:
    if self._eos_token is None:
      return None
    return self.convert_tokens_to_ids(self.eos_token)

  @property
  def unk_token_id(self) -> Optional[int]:
    if self._unk_token is None:
      return None
    return self.convert_tokens_to_ids(self.unk_token)

  @property
  def sep_token_id(self) -> Optional[int]:
    if self._sep_token is None:
      return None
    return self.convert_tokens_to_ids(self.sep_token)

  @property
  def pad_token_id(self) -> Optional[int]:
    if self._pad_token is None:
      return None
    return self.convert_tokens_to_ids(self.pad_token)

  @property
  def pad_token_type_id(self) -> int:
    return self._pad_token_type_id

  @property
  def cls_token_id(self) -> Optional[int]:
    if self._cls_token is None:
      return None
    return self.convert_tokens_to_ids(self.cls_token)

  @property
  def mask_token_id(self) -> Optional[int]:
    if self._mask_token is None:
      return None
    return self.convert_tokens_to_ids(self.mask_token)

  @property
  def additional_special_tokens_ids(self) -> List[int]:
    return self.convert_tokens_to_ids(self.additional_special_tokens)

  # Id setters: the id is mapped back to its token with ``convert_ids_to_tokens`` before being
  # stored, so the matching token getter returns the token. ``None`` unsets the token.

  @bos_token_id.setter
  def bos_token_id(self, value):
    self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None

  @eos_token_id.setter
  def eos_token_id(self, value):
    self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None

  @unk_token_id.setter
  def unk_token_id(self, value):
    self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None

  @sep_token_id.setter
  def sep_token_id(self, value):
    self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None

  @pad_token_id.setter
  def pad_token_id(self, value):
    self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None

  @cls_token_id.setter
  def cls_token_id(self, value):
    self._cls_token = self.convert_ids_to_tokens(value) if value is not None else None

  @mask_token_id.setter
  def mask_token_id(self, value):
    self._mask_token = self.convert_ids_to_tokens(value) if value is not None else None

  @additional_special_tokens_ids.setter
  def additional_special_tokens_ids(self, values):
    self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values]

  @property
  def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
    """
    Map each special token attribute that is set (truthy) to its string form.

    A list/tuple value keeps its container type, with every element converted to ``str``.
    """
    set_attr = {}
    for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
      attr_value = getattr(self, "_" + attr)
      if not attr_value:
        continue
      if isinstance(attr_value, (list, tuple)):
        set_attr[attr] = type(attr_value)(str(tok) for tok in attr_value)
      else:
        set_attr[attr] = str(attr_value)
    return set_attr

  @property
  def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
    """Map each special token attribute that is set (truthy) to its stored value, unconverted."""
    set_attr = {}
    for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
      attr_value = getattr(self, "_" + attr)
      if attr_value:
        set_attr[attr] = attr_value
    return set_attr

  @property
  def all_special_tokens(self) -> List[str]:
    """String form of every token in :attr:`all_special_tokens_extended`."""
    return [str(s) for s in self.all_special_tokens_extended]

  @property
  def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
    """All stored special tokens, flattened, in attribute order and without duplicates."""
    all_toks = []
    for attr_value in self.special_tokens_map_extended.values():
      if isinstance(attr_value, (list, tuple)):
        all_toks.extend(attr_value)
      else:
        all_toks.append(attr_value)
    # ``OrderedDict.fromkeys`` drops duplicates while keeping the first occurrence order.
    return list(OrderedDict.fromkeys(all_toks))

  @property
  def all_special_ids(self) -> List[int]:
    """Result of ``convert_tokens_to_ids`` applied to :attr:`all_special_tokens`."""
    return self.convert_tokens_to_ids(self.all_special_tokens)


class PreTrainedTokenizerBase(SpecialTokensMixin):
  # Maps a constructor argument name to the file name looked up by ``from_pretrained``.
  vocab_files_names: Dict[str, str] = {}
  # Maps a constructor argument name to {model name: file location}; read by ``from_pretrained``
  # for names that are keys of ``max_model_input_sizes``.
  pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {}
  # Maps a model name to default init kwargs, copied by ``from_pretrained``.
  pretrained_init_configuration: Dict[str, Dict[str, Any]] = {}
  # Its keys are the model names ``from_pretrained`` treats as known; ``_from_pretrained`` uses
  # the values to cap ``model_max_length``.
  max_model_input_sizes: Dict[str, Optional[int]] = {}

  # first name has to correspond to main model input name
  # to make sure `tokenizer.pad(...)` works correctly
  model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"]
  # Default padding side; ``__init__`` only accepts "right" or "left".
  padding_side: str = "right"
  # When set, ``_from_pretrained`` may build an instance of this class first.
  slow_tokenizer_class = None

  def __init__(self, **kwargs):
    """
    Record the init kwargs, consume the base-class options and hand the rest to ``SpecialTokensMixin``.

    Consumed keys: ``name_or_path``, ``model_max_length`` (with ``max_len`` as a legacy fallback),
    ``padding_side`` and ``model_input_names``.
    """
    # Kept for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``).
    self.init_inputs = ()
    self.init_kwargs = copy.deepcopy(kwargs)
    self.name_or_path = kwargs.pop("name_or_path", "")

    # ``max_len`` is always removed from kwargs, but only used when ``model_max_length`` is absent.
    legacy_max_len = kwargs.pop("max_len", None)
    requested_max_length = kwargs.pop("model_max_length", legacy_max_len)
    if requested_max_length is None:
      requested_max_length = VERY_LARGE_INTEGER
    self.model_max_length = requested_max_length

    # The class-level default applies unless the caller overrides it.
    self.padding_side = kwargs.pop("padding_side", self.padding_side)
    assert self.padding_side in (
      "right",
      "left",
    ), f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
    self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)

    # Remembers which deprecation notices were already triggered (avoids overlogging).
    self.deprecation_warnings = {}

    super().__init__(**kwargs)

  @property
  def max_len_single_sentence(self) -> int:
    """``model_max_length`` minus the number of special tokens added to a single sequence."""
    length_budget = self.model_max_length
    reserved = self.num_special_tokens_to_add(pair=False)
    return length_budget - reserved

  @property
  def max_len_sentences_pair(self) -> int:
    """``model_max_length`` minus the number of special tokens added to a pair of sequences."""
    length_budget = self.model_max_length
    reserved = self.num_special_tokens_to_add(pair=True)
    return length_budget - reserved

  @max_len_single_sentence.setter
  def max_len_single_sentence(self, value) -> int:
    # Kept for backward compatibility: the assignment is only tolerated when ``value`` equals the
    # computed length and ``self.verbose`` is truthy; it is then recorded in
    # ``deprecation_warnings``. Anything else raises ``ValueError``.
    expected = self.model_max_length - self.num_special_tokens_to_add(pair=False)
    if not (value == expected and self.verbose):
      raise ValueError(
        "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up."
      )
    self.deprecation_warnings["max_len_single_sentence"] = True

  @max_len_sentences_pair.setter
  def max_len_sentences_pair(self, value) -> int:
    # Kept for backward compatibility: the assignment is only tolerated when ``value`` equals the
    # computed length and ``self.verbose`` is truthy; it is then recorded in
    # ``deprecation_warnings``. Anything else raises ``ValueError``.
    expected = self.model_max_length - self.num_special_tokens_to_add(pair=True)
    if not (value == expected and self.verbose):
      raise ValueError(
        "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up."
      )
    self.deprecation_warnings["max_len_sentences_pair"] = True

  def __repr__(self) -> str:
    """Summarise the tokenizer: name/path, vocabulary size, max length, backend, padding side, special tokens."""
    class_label = "PreTrainedTokenizerFast" if self.is_fast else "PreTrainedTokenizer"
    fields = ", ".join(
      [
        f"name_or_path='{self.name_or_path}'",
        f"vocab_size={self.vocab_size}",
        f"model_max_len={self.model_max_length}",
        f"is_fast={self.is_fast}",
        f"padding_side='{self.padding_side}'",
        f"special_tokens={self.special_tokens_map_extended}",
      ]
    )
    return f"{class_label}({fields})"

  def get_vocab(self) -> Dict[str, int]:
    """
    Return the vocabulary as a ``str`` -> ``int`` mapping (per the return annotation).

    Not implemented on this base class: it always raises ``NotImplementedError``; presumably
    overridden by concrete tokenizers.
    """
    raise NotImplementedError()

  @classmethod
  def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
    """
    Resolve the files of a tokenizer and build it through ``cls._from_pretrained``.

    ``pretrained_model_name_or_path`` is interpreted, in this order, as:

    - a key of ``cls.max_model_input_sizes``: file locations are read from
      ``cls.pretrained_vocab_files_map`` and default init kwargs from
      ``cls.pretrained_init_configuration``;
    - the path of an existing file or a remote URL: used as the single vocabulary file (rejected
      when the class declares more than one vocabulary file);
    - a local directory: every expected file name is looked up inside it (under ``subfolder`` when
      given); missing files are recorded as ``None``;
    - anything else: a model identifier turned into URLs with ``hf_bucket_url``.

    Args:
      pretrained_model_name_or_path: Model name, file, URL or directory, as described above.
      *init_inputs: Positional arguments forwarded to ``cls._from_pretrained``.
      **kwargs: ``cache_dir``, ``force_download``, ``resume_download``, ``proxies``,
        ``local_files_only``, ``use_auth_token``, ``revision`` and ``subfolder`` are consumed here;
        the remaining entries are forwarded to ``cls._from_pretrained``.

    Returns:
      Whatever ``cls._from_pretrained`` returns.

    Raises:
      ValueError: For a single file/URL when the class declares several vocabulary files.
      EnvironmentError: When no file at all could be resolved.
    """
    cache_dir = kwargs.pop("cache_dir", None)
    force_download = kwargs.pop("force_download", False)
    resume_download = kwargs.pop("resume_download", False)
    proxies = kwargs.pop("proxies", None)
    local_files_only = kwargs.pop("local_files_only", False)
    use_auth_token = kwargs.pop("use_auth_token", None)
    revision = kwargs.pop("revision", None)
    subfolder = kwargs.pop("subfolder", None)

    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    vocab_files = {}
    init_configuration = {}
    if pretrained_model_name_or_path in cls.max_model_input_sizes:
      # Known model name: take the file locations from the class-level map.
      for file_id, map_list in cls.pretrained_vocab_files_map.items():
        vocab_files[file_id] = map_list[pretrained_model_name_or_path]
      if (
        cls.pretrained_init_configuration
        and pretrained_model_name_or_path in cls.pretrained_init_configuration
      ):
        init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy()
    elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
      # A single file or URL can only stand for a single vocabulary file.
      if len(cls.vocab_files_names) > 1:
        raise ValueError(
          "Calling {}.from_pretrained() with the path to a single file or url is not supported. "
          "Use a model identifier or the path to a directory instead.".format(cls.__name__)
        )
      file_id = list(cls.vocab_files_names.keys())[0]
      vocab_files[file_id] = pretrained_model_name_or_path
    else:
      # At this point pretrained_model_name_or_path is either a directory or a model identifier name
      additional_files_names = {
        "added_tokens_file": ADDED_TOKENS_FILE,
        "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,
        "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
        "tokenizer_file": FULL_TOKENIZER_FILE,
      }
      is_local_dir = os.path.isdir(pretrained_model_name_or_path)
      # Look for the tokenizer files
      for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items():
        if is_local_dir:
          if subfolder is not None:
            full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name)
          else:
            full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
          if not os.path.exists(full_file_name):
            full_file_name = None
        else:
          full_file_name = hf_bucket_url(
            pretrained_model_name_or_path,
            filename=file_name,
            subfolder=subfolder,
            revision=revision,
            mirror=None,
          )

        vocab_files[file_id] = full_file_name

    # Get files from url, cache, or disk depending on the case
    resolved_vocab_files = {}
    for file_id, file_path in vocab_files.items():
      if file_path is None:
        resolved_vocab_files[file_id] = None
        continue
      try:
        resolved_vocab_files[file_id] = cached_path(
          file_path,
          cache_dir=cache_dir,
          force_download=force_download,
          proxies=proxies,
          resume_download=resume_download,
          local_files_only=local_files_only,
          use_auth_token=use_auth_token,
        )
      except FileNotFoundError:
        if not local_files_only:
          raise
        # With ``local_files_only`` a missing file is tolerated: it is simply left out of
        # ``resolved_vocab_files``.
      except requests.exceptions.HTTPError as err:
        if "404 Client Error" not in str(err):
          raise
        # A 404 means this (optional) file does not exist for the model.
        resolved_vocab_files[file_id] = None

    # Also true when nothing was resolved at all (empty mapping).
    if all(full_file_name is None for full_file_name in resolved_vocab_files.values()):
      msg = (
        f"Can't load tokenizer for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
        f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
        f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n"
      )
      raise EnvironmentError(msg)

    return cls._from_pretrained(
      resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
    )

  @classmethod
  def _from_pretrained(
    cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
  ):
    """
    Build a tokenizer instance of ``cls`` from already-resolved vocabulary/config files.

    Steps performed, in order: optionally build a slow tokenizer, load the saved init kwargs,
    overlay the caller's ``kwargs``, revive serialized ``AddedToken`` objects, cap
    ``model_max_length``, forward remaining file paths as init kwargs, instantiate ``cls``,
    apply the special tokens map, register the added tokens and sanitize special tokens.

    Args:
      resolved_vocab_files: mapping of file identifier to resolved path (or ``None``).
        This mapping is mutated: the ``tokenizer_config_file``, ``added_tokens_file`` and
        ``special_tokens_map_file`` entries are popped.
      pretrained_model_name_or_path: identifier stored as ``name_or_path`` and used to look up
        ``cls.max_model_input_sizes``.
      init_configuration: default init kwargs, used when no tokenizer config file is available.
        In that case it is updated in place with ``kwargs``.
      *init_inputs: positional arguments for ``cls``; replaced by the saved ``init_inputs``
        from the config file when empty.
      **kwargs: overrides for the init kwargs; ``from_slow`` is also read from here.

    Returns:
      The instantiated tokenizer.

    Raises:
      OSError: if instantiating ``cls`` raises ``OSError``.
      AssertionError: if the indices in the added tokens file are not consecutive with the
        current tokenizer length.
    """
    # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
    # file or if `from_slow` is set to True.
    from_slow = kwargs.get("from_slow", False)
    has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
    if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
      # Deep copies keep the slow tokenizer's loading from mutating the arguments reused below.
      slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
        copy.deepcopy(resolved_vocab_files),
        pretrained_model_name_or_path,
        copy.deepcopy(init_configuration),
        *init_inputs,
        **(copy.deepcopy(kwargs)),
      )
    else:
      slow_tokenizer = None

    # Prepare tokenizer initialization kwargs.
    # Were init inputs and kwargs saved alongside the tokenizer? If so, reload them.
    tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
    if tokenizer_config_file is not None:
      with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
        init_kwargs = json.load(tokenizer_config_handle)
      saved_init_inputs = init_kwargs.pop("init_inputs", ())
      # Explicitly passed positional inputs take precedence over the saved ones.
      if not init_inputs:
        init_inputs = saved_init_inputs
    else:
      # No saved config: fall back to the defaults (same object, so the update below mutates it).
      init_kwargs = init_configuration

    # Update with newly provided kwargs
    init_kwargs.update(kwargs)

    # Convert AddedTokens serialized as dict to class instances
    def convert_added_tokens(obj: Union[AddedToken, Any]):
      # Recursively walks dicts/lists/tuples; dicts tagged with "__type" == "AddedToken" are
      # turned back into AddedToken instances (the tag is removed from the dict first).
      if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken":
        obj.pop("__type")
        return AddedToken(**obj)
      elif isinstance(obj, (list, tuple)):
        return list(convert_added_tokens(o) for o in obj)
      elif isinstance(obj, dict):
        return {k: convert_added_tokens(v) for k, v in obj.items()}
      return obj

    init_kwargs = convert_added_tokens(init_kwargs)

    # Set max length if needed
    if pretrained_model_name_or_path in cls.max_model_input_sizes:
      # if we're using a pretrained model, ensure the tokenizer
      # won't index sequences longer than the number of positional embeddings
      model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path]
      if model_max_length is not None and isinstance(model_max_length, (int, float)):
        # int(1e30) acts as "no limit" when model_max_length was not supplied in the init kwargs.
        init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)

    # Merge resolved_vocab_files arguments in init_kwargs.
    # The added tokens file is handled separately below, so it is not forwarded to the constructor.
    # "special_tokens_map_file" is still present in the mapping here, so it is forwarded.
    added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
    for args_name, file_path in resolved_vocab_files.items():
      if args_name not in init_kwargs:
        init_kwargs[args_name] = file_path

    if slow_tokenizer is not None:
      init_kwargs["__slow_tokenizer"] = slow_tokenizer

    init_kwargs["name_or_path"] = pretrained_model_name_or_path

    # Instantiate tokenizer.
    try:
      tokenizer = cls(*init_inputs, **init_kwargs)
    except OSError:
      raise OSError(
        "Unable to load vocabulary from file. "
        "Please check that the provided vocabulary is accessible and not corrupted."
      )

    # Note: init_inputs / init_kwargs are not stored on the tokenizer here; according to the
    # earlier comment in this spot, that is now done at the base class level.

    # If there is a complementary special token map, load it
    special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
    if special_tokens_map_file is not None:
      with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
        special_tokens_map = json.load(special_tokens_map_handle)
      for key, value in special_tokens_map.items():
        # Dict values (and dict entries inside lists) are serialized AddedToken objects.
        if isinstance(value, dict):
          value = AddedToken(**value)
        elif isinstance(value, list):
          value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
        setattr(tokenizer, key, value)

    # Add supplementary tokens.
    special_tokens = tokenizer.all_special_tokens
    if added_tokens_file is not None:
      with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
        added_tok_encoder = json.load(added_tokens_handle)

      # Sort added tokens by index
      added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1]))

      for token, index in added_tok_encoder_sorted:
        # Each token must land exactly at the next free index, i.e. the current tokenizer length.
        assert index == len(tokenizer), (
          f"Non-consecutive added token '{token}' found. "
          f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary."
        )
        tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens))

    # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab
    # NOTE(review): the return value is assigned but never used in this method.
    added_tokens = tokenizer.sanitize_special_tokens()

    return tokenizer

  def save_pretrained(
    self,
    save_directory: Union[str, os.PathLike],
    legacy_format: bool = True,
    filename_prefix: Optional[str] = None,
  ) -> Tuple[str]:
    """
    Write the tokenizer configuration and special tokens map to ``save_directory``.

    The tokenizer config is ``self.init_kwargs`` (plus ``init_inputs`` when there are any) with
    the vocabulary file entries removed. The remaining files are written by
    :meth:`_save_pretrained`, whose result is returned.

    Args:
      save_directory: target directory, created if missing. If it points to an existing file,
        nothing is written and ``None`` is returned.
      legacy_format: forwarded unchanged to :meth:`_save_pretrained`.
      filename_prefix: optional prefix; when set, file names become ``"<prefix>-<name>"``.

    Returns:
      The value returned by :meth:`_save_pretrained`.
    """
    if os.path.isfile(save_directory):
      return
    os.makedirs(save_directory, exist_ok=True)

    name_prefix = filename_prefix + "-" if filename_prefix else ""
    special_tokens_map_file = os.path.join(save_directory, name_prefix + SPECIAL_TOKENS_MAP_FILE)
    tokenizer_config_file = os.path.join(save_directory, name_prefix + TOKENIZER_CONFIG_FILE)

    # Work on a deep copy so the live init kwargs are never touched.
    config_payload = copy.deepcopy(self.init_kwargs)
    if len(self.init_inputs) > 0:
      config_payload["init_inputs"] = copy.deepcopy(self.init_inputs)
    # Vocabulary file paths are not part of the saved config.
    for vocab_key in self.vocab_files_names:
      config_payload.pop(vocab_key, None)

    def to_serializable(obj: Union[AddedToken, Any], add_type_field=True):
      # Recursively replace AddedToken instances with their state dict so json can dump them.
      if isinstance(obj, AddedToken):
        state = obj.__getstate__()
        if add_type_field:
          state["__type"] = "AddedToken"
        return state
      if isinstance(obj, (list, tuple)):
        return [to_serializable(item, add_type_field=add_type_field) for item in obj]
      if isinstance(obj, dict):
        return {key: to_serializable(item, add_type_field=add_type_field) for key, item in obj.items()}
      return obj

    # The "__type" tag distinguishes serialized AddedTokens from ordinary dicts in the kwargs.
    config_payload = to_serializable(config_payload, add_type_field=True)
    with open(tokenizer_config_file, "w", encoding="utf-8") as config_handle:
      config_handle.write(json.dumps(config_payload, ensure_ascii=False))

    # The special tokens map is written without the "__type" tag.
    special_tokens_payload = to_serializable(self.special_tokens_map_extended, add_type_field=False)
    with open(special_tokens_map_file, "w", encoding="utf-8") as map_handle:
      map_handle.write(json.dumps(special_tokens_payload, ensure_ascii=False))

    return self._save_pretrained(
      save_directory=save_directory,
      file_names=(tokenizer_config_file, special_tokens_map_file),
      legacy_format=legacy_format,
      filename_prefix=filename_prefix,
    )

  def _save_pretrained(
    self,
    save_directory: Union[str, os.PathLike],
    file_names: Tuple[str],
    legacy_format: bool = True,
    filename_prefix: Optional[str] = None,
  ) -> Tuple[str]:
    if not legacy_format:
      raise ValueError(
        "Only fast tokenizers (instances of PretrainedTokenizerFast) can be saved in non legacy format."
      )

    save_directory = str(save_directory)

    added_tokens_file = os.path.join(
      save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
    )
    added_vocab = self.get_added_vocab()
    if added_vocab:
      with open(added_tokens_file, "w", encoding="utf-8") as f:
        out_str = json.dumps(added_vocab, ensure_ascii=False)
        f.write(out_str)

    vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

    return file_names + vocab_files + (added_tokens_file,)

  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Save the tokenizer vocabulary to ``save_directory``.

    This implementation always raises ``NotImplementedError``; presumably concrete tokenizer
    classes override it. ``_save_pretrained`` concatenates the returned value with tuples,
    so an override is expected to return a tuple of file paths.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError

  def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
    """
    Convert ``text`` (and an optional ``pair``) into a list of token strings.

    This implementation always raises ``NotImplementedError``; presumably concrete tokenizer
    classes provide the actual tokenization.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError

  def encode(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    return_tensors: Optional[Union[str, TensorType]] = None,
    **kwargs
  ) -> List[int]:
    """
    Encode ``text`` (and optionally ``text_pair``) and return only the ``"input_ids"`` entry.

    All arguments are forwarded unchanged to :meth:`encode_plus`; this method merely extracts
    ``"input_ids"`` from its result.
    """
    return self.encode_plus(
      text,
      text_pair=text_pair,
      add_special_tokens=add_special_tokens,
      padding=padding,
      truncation=truncation,
      max_length=max_length,
      stride=stride,
      return_tensors=return_tensors,
      **kwargs,
    )["input_ids"]

  def num_special_tokens_to_add(self, pair: bool = False) -> int:
    """
    Return the number of special tokens added when encoding a single sequence or a pair.

    ``prepare_for_model`` adds this value to the raw id counts to compute the total encoded
    length. This implementation always raises ``NotImplementedError``; presumably concrete
    tokenizer classes override it.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError

  def _get_padding_truncation_strategies(
    self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
  ):
    """
    Translate the user-facing padding/truncation arguments into strategy enums.

    Also honours the legacy ``truncation_strategy`` and ``pad_to_max_length`` keyword arguments,
    which are removed from ``kwargs``.

    Args:
      padding: ``False``, ``True``, a ``PaddingStrategy`` or a value accepted by it.
      truncation: ``False``, ``True``, a ``TruncationStrategy`` or a value accepted by it.
      max_length: explicit maximum length; when ``None`` it may be filled from
        ``self.model_max_length``.
      pad_to_multiple_of: when set together with padding and truncation, ``max_length`` must be
        a multiple of it.
      verbose: when true, deprecation flags are recorded in ``self.deprecation_warnings``.
      **kwargs: extra keyword arguments; the legacy keys are popped, the rest are returned.

    Returns:
      A tuple ``(padding_strategy, truncation_strategy, max_length, kwargs)``.

    Raises:
      ValueError: if padding is requested without a usable pad token, or if ``max_length`` is
        not a multiple of ``pad_to_multiple_of`` while both padding and truncation are active.
    """
    legacy_truncation = kwargs.pop("truncation_strategy", "do_not_truncate")
    legacy_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

    # Backward compatibility: passing only max_length implicitly switches truncation on.
    only_max_length_given = max_length is not None and padding is False and truncation is False
    if only_max_length_given:
      if verbose:
        self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
      truncation = "longest_first"

    # Resolve the padding strategy.
    if padding is False:
      if legacy_pad_to_max_length:
        padding_strategy = PaddingStrategy.LONGEST if max_length is None else PaddingStrategy.MAX_LENGTH
      else:
        padding_strategy = PaddingStrategy.DO_NOT_PAD
    elif padding is True:
      # Default: pad to the longest sequence in the batch.
      padding_strategy = PaddingStrategy.LONGEST
    elif isinstance(padding, PaddingStrategy):
      padding_strategy = padding
    else:
      padding_strategy = PaddingStrategy(padding)

    # Resolve the truncation strategy.
    if truncation is False:
      if legacy_truncation != "do_not_truncate":
        truncation_strategy = TruncationStrategy(legacy_truncation)
      else:
        truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
    elif truncation is True:
      # Default: trim the longest sequence first when given a pair.
      truncation_strategy = TruncationStrategy.LONGEST_FIRST
    elif isinstance(truncation, TruncationStrategy):
      truncation_strategy = truncation
    else:
      truncation_strategy = TruncationStrategy(truncation)

    # Without an explicit max_length, fall back to the model limit, or disable the strategy
    # when the model has no practical limit.
    if max_length is None:
      if padding_strategy == PaddingStrategy.MAX_LENGTH:
        if self.model_max_length > LARGE_INTEGER:
          if verbose:
            self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
          padding_strategy = PaddingStrategy.DO_NOT_PAD
        else:
          max_length = self.model_max_length

      if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
        if self.model_max_length > LARGE_INTEGER:
          if verbose:
            self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
          truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
        else:
          max_length = self.model_max_length

    will_pad = padding_strategy != PaddingStrategy.DO_NOT_PAD
    will_truncate = truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE

    # Padding needs a pad token with a valid id.
    if will_pad and (not self.pad_token or self.pad_token_id < 0):
      raise ValueError(
        "Asking to pad but the tokenizer does not have a padding token. "
        "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
        "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
      )

    # When truncating and padding together, the truncation length must fit the padding multiple.
    if will_truncate and will_pad and pad_to_multiple_of is not None and max_length is not None:
      if max_length % pad_to_multiple_of != 0:
        raise ValueError(
          f"Truncation and padding are both activated but "
          f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
        )

    return padding_strategy, truncation_strategy, max_length, kwargs

  def __call__(
    self,
    text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
  ) -> BatchEncoding:
    """
    Encode one example or a batch of examples, dispatching to the right encoding method.

    The input is treated as a batch when ``text`` is a list/tuple (or, with
    ``is_split_into_words=True``, a non-empty list/tuple whose first element is itself a
    list/tuple). Batches go to :meth:`batch_encode_plus`, everything else to
    :meth:`encode_plus`. All other arguments are forwarded unchanged.

    Args:
      text: a string, a list of strings, or a list of lists of strings.
      text_pair: optional second sequence(s), same shapes as ``text``. For batched input it
        must be a list/tuple with the same length as ``text``.
      is_split_into_words: whether list inputs are pre-tokenized single examples.

    Returns:
      The ``BatchEncoding`` produced by the selected encoding method.

    Raises:
      AssertionError: if ``text`` or ``text_pair`` has an unsupported type.
      TypeError: if ``text`` is a batch but ``text_pair`` is a single string.
      ValueError: if ``text`` and ``text_pair`` are batches of different lengths.
    """

    def _is_text_like(value) -> bool:
      # Accepts str, List[str] or List[List[str]]; only the first element is inspected.
      if isinstance(value, str):
        return True
      if not isinstance(value, (list, tuple)):
        return False
      if len(value) == 0:
        return True
      first = value[0]
      if isinstance(first, str):
        return True
      return isinstance(first, (list, tuple)) and (len(first) == 0 or isinstance(first[0], str))

    # Input type checking for clearer error
    assert _is_text_like(text), (
      "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
      "or `List[List[str]]` (batch of pretokenized examples)."
    )

    assert text_pair is None or _is_text_like(text_pair), (
      "text_pair input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
      "or `List[List[str]]` (batch of pretokenized examples)."
    )

    is_batched = bool(
      (not is_split_into_words and isinstance(text, (list, tuple)))
      or (
        is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
      )
    )

    # Keyword arguments shared by both encoding entry points.
    shared_kwargs = dict(
      add_special_tokens=add_special_tokens,
      padding=padding,
      truncation=truncation,
      max_length=max_length,
      stride=stride,
      is_split_into_words=is_split_into_words,
      pad_to_multiple_of=pad_to_multiple_of,
      return_tensors=return_tensors,
      return_token_type_ids=return_token_type_ids,
      return_attention_mask=return_attention_mask,
      return_overflowing_tokens=return_overflowing_tokens,
      return_special_tokens_mask=return_special_tokens_mask,
      return_offsets_mapping=return_offsets_mapping,
      return_length=return_length,
      verbose=verbose,
      **kwargs,
    )

    if not is_batched:
      return self.encode_plus(text=text, text_pair=text_pair, **shared_kwargs)

    if text_pair is None:
      batch_text_or_text_pairs = text
    else:
      # zip() would silently pair each text with a single character of a str, or drop
      # trailing examples on a length mismatch, so reject both cases explicitly.
      if isinstance(text_pair, str):
        raise TypeError(
          "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`."
        )
      if len(text) != len(text_pair):
        raise ValueError(
          f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
        )
      batch_text_or_text_pairs = list(zip(text, text_pair))

    return self.batch_encode_plus(batch_text_or_text_pairs=batch_text_or_text_pairs, **shared_kwargs)

  def encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
  ) -> BatchEncoding:
    """
    Encode a single sequence or pair of sequences.

    Resolves ``padding`` / ``truncation`` / ``max_length`` (including the legacy
    ``truncation_strategy`` and ``pad_to_max_length`` keyword arguments) through
    :meth:`_get_padding_truncation_strategies`, then delegates to :meth:`_encode_plus`.

    Returns:
      Whatever :meth:`_encode_plus` returns.
    """
    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
    resolved = self._get_padding_truncation_strategies(
      padding=padding,
      truncation=truncation,
      max_length=max_length,
      pad_to_multiple_of=pad_to_multiple_of,
      verbose=verbose,
      **kwargs,
    )
    padding_strategy, truncation_strategy, effective_max_length, remaining_kwargs = resolved

    return self._encode_plus(
      text=text,
      text_pair=text_pair,
      add_special_tokens=add_special_tokens,
      padding_strategy=padding_strategy,
      truncation_strategy=truncation_strategy,
      max_length=effective_max_length,
      stride=stride,
      is_split_into_words=is_split_into_words,
      pad_to_multiple_of=pad_to_multiple_of,
      return_tensors=return_tensors,
      return_token_type_ids=return_token_type_ids,
      return_attention_mask=return_attention_mask,
      return_overflowing_tokens=return_overflowing_tokens,
      return_special_tokens_mask=return_special_tokens_mask,
      return_offsets_mapping=return_offsets_mapping,
      return_length=return_length,
      verbose=verbose,
      **remaining_kwargs,
    )

  def _encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
  ) -> BatchEncoding:
    """
    Backend hook called by ``encode_plus`` to encode a single example.

    It receives the already-resolved ``padding_strategy`` / ``truncation_strategy`` enums
    instead of the user-facing ``padding`` / ``truncation`` arguments. This implementation
    always raises ``NotImplementedError``; presumably concrete tokenizer classes override it.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError

  def batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
      List[TextInput],
      List[TextInputPair],
      List[PreTokenizedInput],
      List[PreTokenizedInputPair],
      List[EncodedInput],
      List[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
  ) -> BatchEncoding:
    """
    Encode a batch of sequences or sequence pairs.

    Resolves ``padding`` / ``truncation`` / ``max_length`` (including the legacy
    ``truncation_strategy`` and ``pad_to_max_length`` keyword arguments) through
    :meth:`_get_padding_truncation_strategies`, then delegates to :meth:`_batch_encode_plus`.

    Returns:
      Whatever :meth:`_batch_encode_plus` returns.
    """
    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
    resolved = self._get_padding_truncation_strategies(
      padding=padding,
      truncation=truncation,
      max_length=max_length,
      pad_to_multiple_of=pad_to_multiple_of,
      verbose=verbose,
      **kwargs,
    )
    padding_strategy, truncation_strategy, effective_max_length, remaining_kwargs = resolved

    return self._batch_encode_plus(
      batch_text_or_text_pairs=batch_text_or_text_pairs,
      add_special_tokens=add_special_tokens,
      padding_strategy=padding_strategy,
      truncation_strategy=truncation_strategy,
      max_length=effective_max_length,
      stride=stride,
      is_split_into_words=is_split_into_words,
      pad_to_multiple_of=pad_to_multiple_of,
      return_tensors=return_tensors,
      return_token_type_ids=return_token_type_ids,
      return_attention_mask=return_attention_mask,
      return_overflowing_tokens=return_overflowing_tokens,
      return_special_tokens_mask=return_special_tokens_mask,
      return_offsets_mapping=return_offsets_mapping,
      return_length=return_length,
      verbose=verbose,
      **remaining_kwargs,
    )

  def _batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
      List[TextInput],
      List[TextInputPair],
      List[PreTokenizedInput],
      List[PreTokenizedInputPair],
      List[EncodedInput],
      List[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
  ) -> BatchEncoding:
    """
    Backend hook called by ``batch_encode_plus`` to encode a batch of examples.

    It receives the already-resolved ``padding_strategy`` / ``truncation_strategy`` enums
    instead of the user-facing ``padding`` / ``truncation`` arguments. This implementation
    always raises ``NotImplementedError``; presumably concrete tokenizer classes override it.

    Raises:
      NotImplementedError: always.
    """
    raise NotImplementedError

  def pad(
    self,
    encoded_inputs: Union[
      BatchEncoding,
      List[BatchEncoding],
      Dict[str, EncodedInput],
      Dict[str, List[EncodedInput]],
      List[Dict[str, EncodedInput]],
    ],
    padding: Union[bool, str, PaddingStrategy] = True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    verbose: bool = True,
  ) -> BatchEncoding:
    """
    Pad a single encoded example or a batch of encoded examples.

    A list of per-example dicts is first turned into a dict of lists. Tensor/array inputs
    (TensorFlow, PyTorch, NumPy) are converted to Python objects for padding, and
    ``return_tensors`` defaults to the matching framework when not given.

    Args:
      encoded_inputs: a dict-like encoding, or a list/tuple of them. It must contain the key
        ``self.model_input_names[0]``.
      padding: padding selection, resolved by :meth:`_get_padding_truncation_strategies`.
      max_length: target length for max-length padding.
      pad_to_multiple_of: forwarded to :meth:`_pad`.
      return_attention_mask: forwarded to :meth:`_pad`.
      return_tensors: tensor type for the returned ``BatchEncoding``.
      verbose: forwarded to :meth:`_get_padding_truncation_strategies`.

    Returns:
      A ``BatchEncoding`` with the padded values. When the main input is empty, the (possibly
      converted) ``encoded_inputs`` is returned as is, with an empty ``"attention_mask"`` added
      if ``return_attention_mask`` is truthy.

    Raises:
      ValueError: if the main model input is missing, or the element type is not a Python,
        NumPy, PyTorch or TensorFlow object.
      AssertionError: if the entries of a batch do not all have the same batch size.
    """
    # If we have a list of dicts, let's convert it in a dict of lists
    # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
    if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
      encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

    # The model's main input name, usually `input_ids`, has to be passed for padding
    main_input_name = self.model_input_names[0]
    if main_input_name not in encoded_inputs:
      raise ValueError(
        "You should supply an encoding or a list of encodings to this method "
        f"that includes {main_input_name}, but you provided {list(encoded_inputs.keys())}"
      )

    required_input = encoded_inputs[main_input_name]

    # Use len() for the emptiness test: the truth value of a multi-element array/tensor is
    # ambiguous and raises. Objects without a length keep the plain truthiness test.
    try:
      is_empty = required_input is None or len(required_input) == 0
    except TypeError:
      is_empty = not required_input

    if is_empty:
      if return_attention_mask:
        encoded_inputs["attention_mask"] = []
      return encoded_inputs

    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
    # and rebuild them afterwards if no return_tensors is specified
    # Note that we lose the specific device the tensor may be on for PyTorch

    first_element = required_input[0]
    if isinstance(first_element, (list, tuple)):
      # first_element might be an empty list/tuple in some edge cases so we grab the first
      # element of the first non-empty sequence. If every sequence is empty, first_element
      # stays an empty list/tuple and no conversion is needed.
      for sequence in required_input:
        if len(sequence) != 0:
          first_element = sequence[0]
          break
    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
    if not isinstance(first_element, (int, list, tuple)):
      if is_tf_available() and _is_tensorflow(first_element):
        return_tensors = "tf" if return_tensors is None else return_tensors
      elif is_torch_available() and _is_torch(first_element):
        return_tensors = "pt" if return_tensors is None else return_tensors
      elif isinstance(first_element, np.ndarray):
        return_tensors = "np" if return_tensors is None else return_tensors
      else:
        raise ValueError(
          f"type of {first_element} unknown: {type(first_element)}. "
          f"Should be one of a python, numpy, pytorch or tensorflow object."
        )

      for key, value in encoded_inputs.items():
        encoded_inputs[key] = to_py_obj(value)

    # Convert padding_strategy in PaddingStrategy
    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
      padding=padding, max_length=max_length, verbose=verbose
    )

    # Re-read the main input: it may have been replaced by its Python-object conversion above.
    required_input = encoded_inputs[main_input_name]
    if required_input and not isinstance(required_input[0], (list, tuple)):
      # Single (un-batched) example.
      encoded_inputs = self._pad(
        encoded_inputs,
        max_length=max_length,
        padding_strategy=padding_strategy,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask,
      )
      return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

    batch_size = len(required_input)
    assert all(
      len(v) == batch_size for v in encoded_inputs.values()
    ), "Some items in the output dictionary have a different batch size than others."

    if padding_strategy == PaddingStrategy.LONGEST:
      # "Longest" is relative to the whole batch, so turn it into a fixed max length
      # before padding the examples one by one.
      max_length = max(len(inputs) for inputs in required_input)
      padding_strategy = PaddingStrategy.MAX_LENGTH

    batch_outputs = {}
    for i in range(batch_size):
      inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
      outputs = self._pad(
        inputs,
        max_length=max_length,
        padding_strategy=padding_strategy,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask,
      )

      for key, value in outputs.items():
        batch_outputs.setdefault(key, []).append(value)

    return BatchEncoding(batch_outputs, tensor_type=return_tensors)

  def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  ) -> List[int]:
    """
    Build the token type ids for one sequence or a pair of sequences.

    Returns:
      A list with one ``0`` per id of ``token_ids_0``, followed by one ``1`` per id of
      ``token_ids_1`` when a second sequence is given.
    """
    first_segment = [0] * len(token_ids_0)
    if token_ids_1 is None:
      return first_segment
    second_segment = [1] * len(token_ids_1)
    return first_segment + second_segment

  def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  ) -> List[int]:
    """
    Assemble the model input ids from one sequence or a pair of sequences.

    This implementation adds no special tokens: it returns ``token_ids_0`` itself when there is
    no second sequence, otherwise the concatenation of both sequences.
    """
    has_pair = token_ids_1 is not None
    return token_ids_0 + token_ids_1 if has_pair else token_ids_0

  def prepare_for_model(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    prepend_batch_axis: bool = False,
    **kwargs
  ) -> BatchEncoding:
    """
    Turn a sequence of ids (or a pair) into model inputs: truncate, add special tokens, pad.

    Args:
      ids: ids of the first sequence.
      pair_ids: optional ids of the second sequence.
      add_special_tokens: whether to build the inputs via
        :meth:`build_inputs_with_special_tokens` and
        :meth:`create_token_type_ids_from_sequences`.
      padding, truncation, max_length, pad_to_multiple_of: resolved through
        :meth:`_get_padding_truncation_strategies`.
      stride: forwarded to :meth:`truncate_sequences`.
      return_tensors, prepend_batch_axis: forwarded to the returned ``BatchEncoding``.
      return_token_type_ids, return_attention_mask: when ``None``, enabled if the matching key
        is listed in ``self.model_input_names``.
      return_overflowing_tokens: adds ``"overflowing_tokens"`` and ``"num_truncated_tokens"``.
      return_special_tokens_mask: adds ``"special_tokens_mask"``.
      return_length: adds ``"length"``, the length of the final ``"input_ids"``.
      return_offsets_mapping: accepted but not used by this method.

    Returns:
      A ``BatchEncoding`` holding ``"input_ids"`` and the requested optional entries.

    Raises:
      ValueError: if ``return_token_type_ids`` is truthy while ``add_special_tokens`` is false.
    """
    # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
    padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
      padding=padding,
      truncation=truncation,
      max_length=max_length,
      pad_to_multiple_of=pad_to_multiple_of,
      verbose=verbose,
      **kwargs,
    )

    pair = pair_ids is not None
    len_ids = len(ids)
    len_pair_ids = len(pair_ids) if pair else 0

    if return_token_type_ids and not add_special_tokens:
      raise ValueError(
        "Asking to return token_type_ids while setting add_special_tokens to False "
        "results in an undefined behavior. Please set add_special_tokens to True or "
        "set return_token_type_ids to None."
      )

    # Load from model defaults
    if return_token_type_ids is None:
      return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
      return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Compute the total size of the returned encodings
    total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

    # Truncation: Handle max sequence length
    overflowing_tokens = []
    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
      ids, pair_ids, overflowing_tokens = self.truncate_sequences(
        ids,
        pair_ids=pair_ids,
        num_tokens_to_remove=total_len - max_length,
        truncation_strategy=truncation_strategy,
        stride=stride,
      )

    if return_overflowing_tokens:
      encoded_inputs["overflowing_tokens"] = overflowing_tokens
      # max_length is None when no limit applies (no explicit value and no model limit);
      # nothing can have been truncated then, and subtracting None would raise a TypeError.
      encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length is not None else 0

    # Add special tokens
    if add_special_tokens:
      sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
      token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
    else:
      sequence = ids + pair_ids if pair else ids
      token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

    # Build output dictionary
    encoded_inputs["input_ids"] = sequence
    if return_token_type_ids:
      encoded_inputs["token_type_ids"] = token_type_ids
    if return_special_tokens_mask:
      if add_special_tokens:
        encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
      else:
        encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

    # Check lengths
    self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)

    # Padding (pad() is also used just to attach the attention mask when no padding is requested)
    if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
      encoded_inputs = self.pad(
        encoded_inputs,
        max_length=max_length,
        padding=padding_strategy.value,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask,
      )

    if return_length:
      encoded_inputs["length"] = len(encoded_inputs["input_ids"])

    return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)

  def truncate_sequences(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    num_tokens_to_remove: int = 0,
    truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
    stride: int = 0,
  ) -> Tuple[List[int], List[int], List[int]]:
    """
    Remove ``num_tokens_to_remove`` tokens from the end of a sequence or pair of sequences.

    Args:
      ids: ids of the first sequence.
      pair_ids: optional ids of the second sequence.
      num_tokens_to_remove: number of tokens to drop; nothing happens when it is ``<= 0``.
      truncation_strategy: a ``TruncationStrategy`` or a value accepted by it.
        ``longest_first`` drops one token at a time from whichever sequence is currently longer
        (the second one on ties); ``only_first`` / ``only_second`` cut the given sequence in one
        go, and do nothing if it is not strictly longer than ``num_tokens_to_remove``.
      stride: number of extra trailing tokens kept in the overflow in addition to the removed
        ones.

    Returns:
      A tuple ``(ids, pair_ids, overflowing_tokens)``.
    """
    if num_tokens_to_remove <= 0:
      return ids, pair_ids, []

    strategy = (
      truncation_strategy
      if isinstance(truncation_strategy, TruncationStrategy)
      else TruncationStrategy(truncation_strategy)
    )

    overflow = []
    if strategy == TruncationStrategy.LONGEST_FIRST:
      for _ in range(num_tokens_to_remove):
        trim_first = pair_ids is None or len(ids) > len(pair_ids)
        source = ids if trim_first else pair_ids
        # The first non-empty grab also takes the stride window; afterwards one token per step.
        window_len = 1 if overflow else min(len(source), stride + 1)
        overflow.extend(source[-window_len:])
        if trim_first:
          ids = ids[:-1]
        else:
          pair_ids = pair_ids[:-1]
    elif strategy == TruncationStrategy.ONLY_FIRST:
      if len(ids) > num_tokens_to_remove:
        window_len = min(len(ids), stride + num_tokens_to_remove)
        overflow = ids[-window_len:]
        ids = ids[:-num_tokens_to_remove]
    elif strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
      if len(pair_ids) > num_tokens_to_remove:
        window_len = min(len(pair_ids), stride + num_tokens_to_remove)
        overflow = pair_ids[-window_len:]
        pair_ids = pair_ids[:-num_tokens_to_remove]

    return (ids, pair_ids, overflow)

  def _pad(
    self,
    encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
    max_length: Optional[int] = None,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
  ) -> dict:
    """
    Pad one encoded example up to a target length.

    ``encoded_inputs`` is updated in place and also returned.

    Args:
        encoded_inputs: Mapping holding at least the entry named by ``self.model_input_names[0]``.
        max_length: Target length. Replaced by the current length when the strategy is ``LONGEST``.
        padding_strategy: ``DO_NOT_PAD`` disables padding entirely.
        pad_to_multiple_of: When given together with a target length, the target is rounded up to the next
            multiple of this value.
        return_attention_mask: Whether to produce ``attention_mask``. ``None`` means "only if
            ``attention_mask`` is listed in ``self.model_input_names``".

    Returns:
        The same mapping, with the main input, ``token_type_ids`` and ``special_tokens_mask`` (when present)
        extended on ``self.padding_side``.

    Raises:
        ValueError: If padding is required and ``self.padding_side`` is neither ``"right"`` nor ``"left"``.
    """
    if return_attention_mask is None:
      # Fall back to what the model declares as its inputs.
      return_attention_mask = "attention_mask" in self.model_input_names

    required_input = encoded_inputs[self.model_input_names[0]]

    if padding_strategy == PaddingStrategy.LONGEST:
      max_length = len(required_input)

    if max_length is not None and pad_to_multiple_of is not None:
      if max_length % pad_to_multiple_of != 0:
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

    # Nothing to pad: at most an all-ones attention mask has to be created.
    if padding_strategy == PaddingStrategy.DO_NOT_PAD or len(required_input) == max_length:
      if return_attention_mask and "attention_mask" not in encoded_inputs:
        encoded_inputs["attention_mask"] = [1] * len(required_input)
      return encoded_inputs

    difference = max_length - len(required_input)

    side = self.padding_side
    if side == "right":

      def extend(values, filler):
        return values + [filler] * difference

    elif side == "left":

      def extend(values, filler):
        return [filler] * difference + values

    else:
      raise ValueError("Invalid padding strategy:" + str(self.padding_side))

    if return_attention_mask:
      encoded_inputs["attention_mask"] = extend([1] * len(required_input), 0)
    if "token_type_ids" in encoded_inputs:
      encoded_inputs["token_type_ids"] = extend(encoded_inputs["token_type_ids"], self.pad_token_type_id)
    if "special_tokens_mask" in encoded_inputs:
      # Padding positions are flagged as special.
      encoded_inputs["special_tokens_mask"] = extend(encoded_inputs["special_tokens_mask"], 1)
    encoded_inputs[self.model_input_names[0]] = extend(required_input, self.pad_token_id)

    return encoded_inputs

  def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """
    Placeholder for turning a list of tokens into a single string.

    Args:
        tokens (:obj:`List[str]`): The tokens to convert.

    Raises:
        NotImplementedError: Always; this implementation provides no behavior of its own.
    """
    raise NotImplementedError

  def batch_decode(
    self,
    sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = True,
    **kwargs
  ) -> List[str]:
    """
    Decode every element of ``sequences`` with :meth:`decode`.

    Args:
        sequences: Iterable whose items are each passed to :meth:`decode`.
        skip_special_tokens: Forwarded unchanged to :meth:`decode`.
        clean_up_tokenization_spaces: Forwarded unchanged to :meth:`decode`.
        kwargs: Extra keyword arguments forwarded to :meth:`decode`.

    Returns:
        :obj:`List[str]`: One decoded result per item, in input order.
    """
    decoded = []
    for seq in sequences:
      decoded.append(
        self.decode(
          seq,
          skip_special_tokens=skip_special_tokens,
          clean_up_tokenization_spaces=clean_up_tokenization_spaces,
          **kwargs,
        )
      )
    return decoded

  def decode(
    self,
    token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = True,
    **kwargs
  ) -> str:
    """
    Decode token ids into a string.

    The ids are first passed through ``to_py_obj`` and the result is handed to :meth:`_decode`, which does
    the actual work.

    Args:
        token_ids: The ids to decode.
        skip_special_tokens: Forwarded to :meth:`_decode`.
        clean_up_tokenization_spaces: Forwarded to :meth:`_decode`.
        kwargs: Extra keyword arguments forwarded to :meth:`_decode`.

    Returns:
        :obj:`str`: Whatever :meth:`_decode` returns.
    """
    return self._decode(
      token_ids=to_py_obj(token_ids),
      skip_special_tokens=skip_special_tokens,
      clean_up_tokenization_spaces=clean_up_tokenization_spaces,
      **kwargs,
    )

  def _decode(
    self,
    token_ids: Union[int, List[int]],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = True,
    **kwargs
  ) -> str:
    """
    Decoding hook invoked by :meth:`decode` with the ids already converted by ``to_py_obj``.

    Raises:
        NotImplementedError: Always; this implementation provides no behavior of its own.
    """
    raise NotImplementedError

  def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
  ) -> List[int]:
    """
    Flag which ids of an already-formatted sequence are special tokens.

    Args:
        token_ids_0: Ids of the sequence; must already contain its special tokens.
        token_ids_1: Must be ``None`` for this implementation.
        already_has_special_tokens: Must be ``True`` for this implementation.

    Returns:
        :obj:`List[int]`: ``1`` where the id is in ``self.all_special_ids``, ``0`` elsewhere.

    Raises:
        AssertionError: If ``already_has_special_tokens`` is false or ``token_ids_1`` is given.
    """
    assert already_has_special_tokens and token_ids_1 is None, (
      "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
      "Please use a slow (full python) tokenizer to activate this argument."
      "Or set `return_special_tokens_mask=True` when calling the encoding method "
      "to get the special tokens mask in any tokenizer. "
    )

    # Read the property once rather than on every membership test.
    special_ids = self.all_special_ids

    mask = []
    for token in token_ids_0:
      mask.append(1 if token in special_ids else 0)
    return mask

  @staticmethod
  def clean_up_tokenization(out_string: str) -> str:
    """
    Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
    Args:
        out_string (:obj:`str`): The text to clean up.
    Returns:
        :obj:`str`: The cleaned-up string.
    """
    out_string = (
      out_string.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )
    return out_string

  def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool):
    if max_length is None and len(ids) > self.model_max_length and verbose:
      self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True

  @contextmanager
  def as_target_tokenizer(self):
    """
    Context manager wrapped around the encoding of target texts (see :meth:`prepare_seq2seq_batch`).

    This implementation changes no state: it simply yields once.
    """
    yield

  def prepare_seq2seq_batch(
    self,
    src_texts: List[str],
    tgt_texts: Optional[List[str]] = None,
    max_length: Optional[int] = None,
    max_target_length: Optional[int] = None,
    padding: str = "longest",
    return_tensors: str = None,
    truncation: bool = True,
    **kwargs,
  ) -> BatchEncoding:
    """
    Encode source texts and, optionally, target texts whose ``input_ids`` become ``labels``.

    Args:
        src_texts: Texts to encode as model inputs.
        tgt_texts: Texts to encode as labels. When ``None`` only the source encoding is returned.
        max_length: Maximum length for the source side; defaults to ``self.model_max_length``.
        max_target_length: Maximum length for the target side; defaults to the source ``max_length``.
        padding: Forwarded to both encoding calls.
        return_tensors: Forwarded to both encoding calls.
        truncation: Forwarded to both encoding calls.
        kwargs: Forwarded to both encoding calls, except ``src_lang`` and ``tgt_lang`` which are discarded.

    Returns:
        The source encoding, with a ``"labels"`` entry added when ``tgt_texts`` is given.
    """
    # mBART-specific kwargs that should be ignored by other models.
    for lang_key in ("src_lang", "tgt_lang"):
      kwargs.pop(lang_key, None)

    if max_length is None:
      max_length = self.model_max_length

    # Arguments shared by the source and target encoding calls.
    shared = dict(
      add_special_tokens=True,
      return_tensors=return_tensors,
      padding=padding,
      truncation=truncation,
    )

    model_inputs = self(src_texts, max_length=max_length, **shared, **kwargs)
    if tgt_texts is None:
      return model_inputs

    # Process tgt_texts
    if max_target_length is None:
      max_target_length = max_length
    with self.as_target_tokenizer():
      labels = self(tgt_texts, max_length=max_target_length, **shared, **kwargs)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


class PreTrainedTokenizer(PreTrainedTokenizerBase):
  def __init__(self, **kwargs):
    """
    Forward ``kwargs`` to the parent initializer and create empty containers for added tokens.
    """
    super().__init__(**kwargs)
    # Added tokens - We store this for both slow and fast tokenizers
    # until the serialization of Fast tokenizers is updated
    # token string -> id, for tokens added on top of the base vocabulary
    self.added_tokens_encoder: Dict[str, int] = {}
    # reverse mapping: id -> token string
    self.added_tokens_decoder: Dict[int, str] = {}
    # tokens that `tokenize` keeps whole instead of passing them to `_tokenize`
    self.unique_no_split_tokens: List[str] = []

  @property
  def is_fast(self) -> bool:
    """
    :obj:`bool`: Always :obj:`False` for this class.
    """
    return False

  @property
  def vocab_size(self) -> int:
    """
    :obj:`int`: Size of the base vocabulary (without the added tokens).

    Raises:
        NotImplementedError: Always in this class; :meth:`__len__` relies on a working override.
    """
    raise NotImplementedError

  def get_added_vocab(self) -> Dict[str, int]:
    """
    Returns the added tokens in the vocabulary as a dictionary of token to index.

    The returned object is ``self.added_tokens_encoder`` itself, not a copy, so mutating it changes the
    tokenizer.

    Returns:
        :obj:`Dict[str, int]`: The added tokens.
    """
    return self.added_tokens_encoder

  def __len__(self):
    """
    Size of the full vocabulary with the added tokens.

    Computed as ``self.vocab_size`` plus the number of entries in ``self.added_tokens_encoder``.
    """
    return self.vocab_size + len(self.added_tokens_encoder)

  def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
    """
    Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
    it with indices starting from length of the current vocabulary.
    Args:
        new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`):
            Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
            checking if the tokenizer assign the index of the ``unk_token`` to them).
        special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the tokens should be added as special tokens.
    Returns:
        :obj:`int`: The number of tokens actually added to the vocabulary.
    Examples::
        # Let's see how to increase the vocabulary of Bert model and tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertModel.from_pretrained('bert-base-uncased')
        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
        print('We have added', num_added_toks, 'tokens')
        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
        model.resize_token_embeddings(len(tokenizer))
    """
    new_tokens = [str(tok) for tok in new_tokens]

    tokens_to_add = []
    for token in new_tokens:
      assert isinstance(token, str)
      if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case:
        token = token.lower()
      if (
        token != self.unk_token
        and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
        and token not in tokens_to_add
      ):
        tokens_to_add.append(token)

    added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
    added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
    self.added_tokens_encoder.update(added_tok_encoder)
    self.added_tokens_decoder.update(added_tok_decoder)

    # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert)
    if special_tokens:
      self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens)))
    else:
      # Or on the newly added tokens
      self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add)))

    return len(tokens_to_add)

  def num_special_tokens_to_add(self, pair: bool = False) -> int:
    """
    Returns the number of added tokens when encoding a sequence with special tokens.
    .. note::
        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
        put this inside your training loop.
    Args:
        pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether the number of added tokens should be computed in the case of a sequence pair or a single
            sequence.
    Returns:
        :obj:`int`: Number of special tokens added to sequences.
    """
    token_ids_0 = []
    token_ids_1 = []
    return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))

  def tokenize(self, text: TextInput, **kwargs) -> List[str]:
    """
    Converts a string in a sequence of tokens, using the tokenizer.
    Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
    (BPE/SentencePieces/WordPieces). Takes care of added tokens.
    Args:
        text (:obj:`str`):
            The sequence to be encoded.
        **kwargs (additional keyword arguments):
            Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method.
    Returns:
        :obj:`List[str]`: The list of tokens.
    """
    # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
    all_special_tokens_extended = dict(
      (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
    )

    text, kwargs = self.prepare_for_tokenization(text, **kwargs)

    # TODO: should this be in the base class?
    if hasattr(self, "do_lower_case") and self.do_lower_case:
      # convert non-special tokens to lowercase
      escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
      pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
      text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)

    def split_on_token(tok, text):
      result = []
      tok_extended = all_special_tokens_extended.get(tok, None)
      split_text = text.split(tok)
      full_word = ""
      for i, sub_text in enumerate(split_text):
        # AddedToken can control whitespace stripping around them.
        # We use them for GPT2 and Roberta to have different behavior depending on the special token
        # Cf. https://github.com/huggingface/transformers/pull/2778
        # and https://github.com/huggingface/transformers/issues/3788
        if isinstance(tok_extended, AddedToken):
          if tok_extended.single_word:
            # Try to avoid splitting on token
            if (
              i < len(split_text) - 1
              and not _is_end_of_word(sub_text)
              and not _is_start_of_word(split_text[i + 1])
            ):
              # Don't extract the special token
              full_word += sub_text + tok
            elif full_word:
              full_word += sub_text
              result.append(full_word)
              full_word = ""
              continue
          # Strip white spaces on the right
          if tok_extended.rstrip and i > 0:
            # A bit counter-intuitive but we strip the left of the string
            # since tok_extended.rstrip means the special token is eating all white spaces on its right
            sub_text = sub_text.lstrip()
          # Strip white spaces on the left
          if tok_extended.lstrip and i < len(split_text) - 1:
            sub_text = sub_text.rstrip()  # Opposite here
        else:
          # We strip left and right by default
          if i < len(split_text) - 1:
            sub_text = sub_text.rstrip()
          if i > 0:
            sub_text = sub_text.lstrip()

        if i == 0 and not sub_text:
          result.append(tok)
        elif i == len(split_text) - 1:
          if sub_text:
            result.append(sub_text)
          else:
            pass
        else:
          if sub_text:
            result.append(sub_text)
          result.append(tok)
      return result

    def split_on_tokens(tok_list, text):
      if not text.strip():
        return []
      if not tok_list:
        return self._tokenize(text)

      tokenized_text = []
      text_list = [text]
      for tok in tok_list:
        tokenized_text = []
        for sub_text in text_list:
          if sub_text not in self.unique_no_split_tokens:
            tokenized_text.extend(split_on_token(tok, sub_text))
          else:
            tokenized_text.append(sub_text)
        text_list = tokenized_text

      return list(
        itertools.chain.from_iterable(
          (
            self._tokenize(token) if token not in self.unique_no_split_tokens else [token]
            for token in tokenized_text
          )
        )
      )

    no_split_token = self.unique_no_split_tokens
    tokenized_text = split_on_tokens(no_split_token, text)
    return tokenized_text

  def _tokenize(self, text, **kwargs):
    """
    Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
    vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
    Do NOT take care of added tokens.

    :meth:`tokenize` calls this hook on every piece of text that is not a no-split token.

    Raises:
        NotImplementedError: Always in this class.
    """
    raise NotImplementedError

  def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
    """
    Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
    vocabulary.
    Args:
        tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
    Returns:
        :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
    """
    if tokens is None:
      return None

    if isinstance(tokens, str):
      return self._convert_token_to_id_with_added_voc(tokens)

    ids = []
    for token in tokens:
      ids.append(self._convert_token_to_id_with_added_voc(token))
    return ids

  def _convert_token_to_id_with_added_voc(self, token):
    if token is None:
      return None

    if token in self.added_tokens_encoder:
      return self.added_tokens_encoder[token]
    return self._convert_token_to_id(token)

  def _convert_token_to_id(self, token):
    """
    Hook used by ``_convert_token_to_id_with_added_voc`` for tokens that are not added tokens.

    Raises:
        NotImplementedError: Always in this class.
    """
    raise NotImplementedError

  def _encode_plus(
    self,
    text: Union[TextInput, PreTokenizedInput, EncodedInput],
    text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
  ) -> BatchEncoding:
    """
    Turn one text (or one text pair) into ids and hand them to ``prepare_for_model``.

    Each input may be a string, a non-empty list/tuple of strings (tokenized word by word when
    ``is_split_into_words`` is set, otherwise treated as tokens) or a non-empty list/tuple of ints (used as
    ids directly). ``kwargs`` are forwarded to :meth:`tokenize`.

    Raises:
        NotImplementedError: If ``return_offsets_mapping`` is requested.
        ValueError: If an input has none of the accepted shapes.
    """
    if return_offsets_mapping:
      raise NotImplementedError(
        "return_offset_mapping is not available when using Python tokenizers."
        "To use this feature, change your tokenizer to one deriving from "
        "transformers.PreTrainedTokenizerFast."
        "More information on available tokenizers at "
        "https://github.com/huggingface/transformers/pull/2674"
      )

    def get_input_ids(text):
      # Raw string: tokenize, then map the tokens to ids.
      if isinstance(text, str):
        return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))

      non_empty_sequence = isinstance(text, (list, tuple)) and len(text) > 0
      if non_empty_sequence and isinstance(text[0], str):
        if not is_split_into_words:
          # The strings are already tokens.
          return self.convert_tokens_to_ids(text)
        word_pieces = []
        for word in text:
          word_pieces.extend(self.tokenize(word, is_split_into_words=True, **kwargs))
        return self.convert_tokens_to_ids(word_pieces)
      if non_empty_sequence and isinstance(text[0], int):
        # Already ids.
        return text

      if is_split_into_words:
        raise ValueError(
          f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`."
        )
      raise ValueError(
        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
      )

    first_ids = get_input_ids(text)
    second_ids = None if text_pair is None else get_input_ids(text_pair)

    return self.prepare_for_model(
      first_ids,
      pair_ids=second_ids,
      add_special_tokens=add_special_tokens,
      padding=padding_strategy.value,
      truncation=truncation_strategy.value,
      max_length=max_length,
      stride=stride,
      pad_to_multiple_of=pad_to_multiple_of,
      return_tensors=return_tensors,
      prepend_batch_axis=True,
      return_attention_mask=return_attention_mask,
      return_token_type_ids=return_token_type_ids,
      return_overflowing_tokens=return_overflowing_tokens,
      return_special_tokens_mask=return_special_tokens_mask,
      return_length=return_length,
      verbose=verbose,
    )

  def _batch_encode_plus(
    self,
    batch_text_or_text_pairs: Union[
      List[TextInput],
      List[TextInputPair],
      List[PreTokenizedInput],
      List[PreTokenizedInputPair],
      List[EncodedInput],
      List[EncodedInputPair],
    ],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    is_split_into_words: bool = False,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
    **kwargs
  ) -> BatchEncoding:
    """
    Turn a batch of texts (or text pairs) into ids and hand them to ``_batch_prepare_for_model``.

    A batch item that is not a list/tuple is a single input. A list/tuple item is a single pre-split input
    when ``is_split_into_words`` is set and its first element is not itself a list/tuple; otherwise it is
    unpacked as a ``(first, second)`` pair. ``kwargs`` are forwarded to :meth:`tokenize`.

    Raises:
        NotImplementedError: If ``return_offsets_mapping`` is requested.
        ValueError: If an input has none of the accepted shapes.
    """
    if return_offsets_mapping:
      raise NotImplementedError(
        "return_offset_mapping is not available when using Python tokenizers."
        "To use this feature, change your tokenizer to one deriving from "
        "transformers.PreTrainedTokenizerFast."
      )

    def get_input_ids(text):
      # Raw string: tokenize, then map the tokens to ids.
      if isinstance(text, str):
        return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))

      non_empty_sequence = isinstance(text, (list, tuple)) and len(text) > 0
      if non_empty_sequence and isinstance(text[0], str):
        if not is_split_into_words:
          # The strings are already tokens.
          return self.convert_tokens_to_ids(text)
        word_pieces = []
        for word in text:
          word_pieces.extend(self.tokenize(word, is_split_into_words=True, **kwargs))
        return self.convert_tokens_to_ids(word_pieces)
      if non_empty_sequence and isinstance(text[0], int):
        # Already ids.
        return text

      raise ValueError(
        "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
      )

    input_ids = []
    for ids_or_pair_ids in batch_text_or_text_pairs:
      is_single_input = not isinstance(ids_or_pair_ids, (list, tuple)) or (
        is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple))
      )
      if is_single_input:
        ids, pair_ids = ids_or_pair_ids, None
      else:
        ids, pair_ids = ids_or_pair_ids

      first_ids = get_input_ids(ids)
      second_ids = None if pair_ids is None else get_input_ids(pair_ids)
      input_ids.append((first_ids, second_ids))

    batch_outputs = self._batch_prepare_for_model(
      input_ids,
      add_special_tokens=add_special_tokens,
      padding_strategy=padding_strategy,
      truncation_strategy=truncation_strategy,
      max_length=max_length,
      stride=stride,
      pad_to_multiple_of=pad_to_multiple_of,
      return_attention_mask=return_attention_mask,
      return_token_type_ids=return_token_type_ids,
      return_overflowing_tokens=return_overflowing_tokens,
      return_special_tokens_mask=return_special_tokens_mask,
      return_length=return_length,
      return_tensors=return_tensors,
      verbose=verbose,
    )

    return BatchEncoding(batch_outputs)

  def _batch_prepare_for_model(
    self,
    batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[str] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    verbose: bool = True,
  ) -> BatchEncoding:
    """
    Run ``prepare_for_model`` on every ``(first_ids, second_ids)`` pair, gather the per-example outputs
    key by key, then pad the whole batch and wrap it in a :class:`BatchEncoding`.

    Padding, attention masks and tensor conversion are deliberately disabled for the per-example calls and
    applied once on the collected batch.

    Args:
        batch_ids_pairs: list of tokenized input ids or input ids pairs
    """
    collected = {}
    for first_ids, second_ids in batch_ids_pairs:
      example = self.prepare_for_model(
        first_ids,
        second_ids,
        add_special_tokens=add_special_tokens,
        padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
        truncation=truncation_strategy.value,
        max_length=max_length,
        stride=stride,
        pad_to_multiple_of=None,  # we pad in batch afterward
        return_attention_mask=False,  # we pad in batch afterward
        return_token_type_ids=return_token_type_ids,
        return_overflowing_tokens=return_overflowing_tokens,
        return_special_tokens_mask=return_special_tokens_mask,
        return_length=return_length,
        return_tensors=None,  # We convert the whole batch to tensors at the end
        prepend_batch_axis=False,
        verbose=verbose,
      )

      # Transpose: one list per output key, one entry per example.
      for key, value in example.items():
        collected.setdefault(key, []).append(value)

    padded = self.pad(
      collected,
      padding=padding_strategy.value,
      max_length=max_length,
      pad_to_multiple_of=pad_to_multiple_of,
      return_attention_mask=return_attention_mask,
    )

    return BatchEncoding(padded, tensor_type=return_tensors)

  def prepare_for_tokenization(
    self, text: str, is_split_into_words: bool = False, **kwargs
  ) -> Tuple[str, Dict[str, Any]]:
    """
    Performs any necessary transformations before tokenization.
    This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the
    :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.

    This implementation returns ``text`` unchanged. Because ``is_split_into_words`` is a named parameter, it
    is consumed here and is not part of the returned ``kwargs``.

    Args:
        text (:obj:`str`):
            The text to prepare.
        is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the text has been pretokenized.
        kwargs:
            Keyword arguments to use for the tokenization.
    Returns:
        :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
    """
    return (text, kwargs)

  def get_special_tokens_mask(
    self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
  ) -> List[int]:
    """
    Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
    special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
    Args:
        token_ids_0 (:obj:`List[int]`):
            List of ids of the first sequence.
        token_ids_1 (:obj:`List[int]`, `optional`):
            List of ids of the second sequence.
        already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not the token list is already formatted with special tokens for the model.
    Returns:
        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
    """
    return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))

  @overload
  def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str:
    ...

  @overload
  def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]:
    ...

  def convert_ids_to_tokens(
    self, ids: Union[int, List[int]], skip_special_tokens: bool = False
  ) -> Union[str, List[str]]:
    """
    Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
    added tokens.
    Args:
        ids (:obj:`int` or :obj:`List[int]`):
            The token id (or token ids) to convert to tokens.
        skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to remove special tokens in the decoding.
    Returns:
        :obj:`str` or :obj:`List[str]`: The decoded token(s).
    """
    if isinstance(ids, int):
      if ids in self.added_tokens_decoder:
        return self.added_tokens_decoder[ids]
      else:
        return self._convert_id_to_token(ids)
    tokens = []
    for index in ids:
      index = int(index)
      if skip_special_tokens and index in self.all_special_ids:
        continue
      if index in self.added_tokens_decoder:
        tokens.append(self.added_tokens_decoder[index])
      else:
        tokens.append(self._convert_id_to_token(index))
    return tokens

  def _convert_id_to_token(self, index: int) -> str:
    """
    Hook used by :meth:`convert_ids_to_tokens` for ids that are not in ``self.added_tokens_decoder``.

    Raises:
        NotImplementedError: Always in this class.
    """
    raise NotImplementedError

  def convert_tokens_to_string(self, tokens: List[str]) -> str:
    """
    Join the tokens into one string, separated by single spaces.

    Args:
        tokens (:obj:`List[str]`): The tokens to join.

    Returns:
        :obj:`str`: The joined string (empty for an empty list).
    """
    return " ".join(tokens)

  def _decode(
    self,
    token_ids: List[int],
    skip_special_tokens: bool = False,
    clean_up_tokenization_spaces: bool = True,
    spaces_between_special_tokens: bool = True,
  ) -> str:
    filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

    # To avoid mixing byte-level and unicode for byte-level BPT
    # we need to build string separately for added tokens and byte-level tokens
    # cf. https://github.com/huggingface/transformers/issues/1133
    sub_texts = []
    current_sub_text = []
    for token in filtered_tokens:
      if skip_special_tokens and token in self.all_special_ids:
        continue
      if token in self.added_tokens_encoder:
        if current_sub_text:
          sub_texts.append(self.convert_tokens_to_string(current_sub_text))
          current_sub_text = []
        sub_texts.append(token)
      else:
        current_sub_text.append(token)
    if current_sub_text:
      sub_texts.append(self.convert_tokens_to_string(current_sub_text))

    if spaces_between_special_tokens:
      text = " ".join(sub_texts)
    else:
      text = "".join(sub_texts)

    if clean_up_tokenization_spaces:
      clean_text = self.clean_up_tokenization(text)
      return clean_text
    else:
      return text


class BertTokenizer(PreTrainedTokenizer):
  """
  BERT tokenizer: optional rule-based pre-tokenization followed by WordPiece.

  When ``do_basic_tokenize`` is true, text is first split by a
  :class:`BasicTokenizer` and every resulting token is then broken into word
  pieces by a :class:`WordpieceTokenizer`; otherwise the text goes straight to
  the WordPiece step.

  Args:
      vocab_file: Passed to ``load_vocab``; the result is used as the
        token -> id mapping.
      do_lower_case: Forwarded to the :class:`BasicTokenizer`.
      do_basic_tokenize: Whether to run the :class:`BasicTokenizer` before WordPiece.
      never_split: Tokens the :class:`BasicTokenizer` must leave untouched.
      unk_token: Token used for anything that cannot be mapped to the vocabulary.
      sep_token: Separator token placed after each sequence.
      pad_token: Padding token.
      cls_token: Token placed at the start of every built input.
      mask_token: Masking token.
      tokenize_chinese_chars: Forwarded to the :class:`BasicTokenizer`.
      strip_accents: Forwarded to the :class:`BasicTokenizer`.
      **kwargs: Forwarded unchanged to the parent constructor.
  """

  vocab_files_names = VOCAB_FILES_NAMES
  pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
  pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
  max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

  def __init__(
    self,
    vocab_file,
    do_lower_case=True,
    do_basic_tokenize=True,
    never_split=None,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    tokenize_chinese_chars=True,
    strip_accents=None,
    **kwargs
  ):
    super().__init__(
      do_lower_case=do_lower_case,
      do_basic_tokenize=do_basic_tokenize,
      never_split=never_split,
      unk_token=unk_token,
      sep_token=sep_token,
      pad_token=pad_token,
      cls_token=cls_token,
      mask_token=mask_token,
      tokenize_chinese_chars=tokenize_chinese_chars,
      strip_accents=strip_accents,
      **kwargs,
    )
    # Remembered so that ``do_lower_case`` can still be answered when no
    # BasicTokenizer is created below.
    self._do_lower_case = do_lower_case
    self.vocab = load_vocab(vocab_file)
    # Reverse mapping (id -> token) used when decoding.
    self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
    self.do_basic_tokenize = do_basic_tokenize
    if do_basic_tokenize:
      self.basic_tokenizer = BasicTokenizer(
        do_lower_case=do_lower_case,
        never_split=never_split,
        tokenize_chinese_chars=tokenize_chinese_chars,
        strip_accents=strip_accents,
      )
    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)

  @property
  def do_lower_case(self):
    """Lower-casing flag of the basic tokenizer, or the constructor value when there is none."""
    basic_tokenizer = getattr(self, "basic_tokenizer", None)
    if basic_tokenizer is not None:
      return basic_tokenizer.do_lower_case
    # ``basic_tokenizer`` only exists when ``do_basic_tokenize`` was true; fall
    # back to the stored constructor argument instead of raising AttributeError.
    return self._do_lower_case

  @property
  def vocab_size(self):
    """Number of entries in the loaded vocabulary (added tokens are not counted)."""
    return len(self.vocab)

  def get_vocab(self):
    """Return a new dict combining the base vocabulary with ``added_tokens_encoder``."""
    return dict(self.vocab, **self.added_tokens_encoder)

  def _tokenize(self, text):
    """
    Split ``text`` into WordPiece tokens.

    Args:
        text: The string to tokenize.

    Returns:
        The list of resulting word pieces.
    """
    split_tokens = []
    if self.do_basic_tokenize:
      for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):

        # If the token is part of the never_split set
        # (i.e. the set the basic tokenizer was configured with; the special
        # tokens passed per call above are not part of that stored set)
        if token in self.basic_tokenizer.never_split:
          split_tokens.append(token)
        else:
          split_tokens += self.wordpiece_tokenizer.tokenize(token)
    else:
      split_tokens = self.wordpiece_tokenizer.tokenize(text)
    return split_tokens

  def _convert_token_to_id(self, token):
    """Look up the id of ``token``, falling back to the id of ``unk_token``."""
    return self.vocab.get(token, self.vocab.get(self.unk_token))

  def _convert_id_to_token(self, index):
    """Look up the token for ``index``, falling back to ``unk_token`` for unknown ids."""
    return self.ids_to_tokens.get(index, self.unk_token)

  def convert_tokens_to_string(self, tokens):
    """Join tokens with spaces, then glue ``##`` continuation pieces onto the preceding piece."""
    out_string = " ".join(tokens).replace(" ##", "").strip()
    return out_string

  def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  ) -> List[int]:
    """
    Wrap one or two id sequences with the classifier and separator ids.

    Args:
        token_ids_0 (:obj:`List[int]`): Ids of the first sequence.
        token_ids_1 (:obj:`List[int]`, `optional`): Ids of the second sequence.

    Returns:
        :obj:`List[int]`: ``cls + A + sep`` for a single sequence, or
        ``cls + A + sep + B + sep`` for a pair.
    """
    if token_ids_1 is None:
      return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
    cls = [self.cls_token_id]
    sep = [self.sep_token_id]
    return cls + token_ids_0 + sep + token_ids_1 + sep

  def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
  ) -> List[int]:
    """
    Build a mask with 1 at special-token positions and 0 elsewhere.

    Args:
        token_ids_0 (:obj:`List[int]`): Ids of the first sequence.
        token_ids_1 (:obj:`List[int]`, `optional`): Ids of the second sequence.
        already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether ``token_ids_0`` already contains the special tokens.

    Returns:
        :obj:`List[int]`: The mask, one entry per position of the built input.

    Raises:
        ValueError: If ``already_has_special_tokens`` is set together with ``token_ids_1``.
    """
    if already_has_special_tokens:
      if token_ids_1 is not None:
        raise ValueError(
          "You should not supply a second sequence if the provided sequence of "
          "ids is already formatted with special tokens for the model."
        )
      # Built once instead of once per element; a tuple keeps the same
      # equality-based membership test as a list.
      special_ids = (self.sep_token_id, self.cls_token_id)
      return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

    if token_ids_1 is not None:
      return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
    return [1] + ([0] * len(token_ids_0)) + [1]

  def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
  ) -> List[int]:
    """
    Build segment ids for one or two sequences.

    Args:
        token_ids_0 (:obj:`List[int]`): Ids of the first sequence.
        token_ids_1 (:obj:`List[int]`, `optional`): Ids of the second sequence.

    Returns:
        :obj:`List[int]`: 0 for every position of ``cls + A + sep`` followed,
        when a second sequence is given, by 1 for every position of ``B + sep``.
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]
    if token_ids_1 is None:
      return len(cls + token_ids_0 + sep) * [0]
    return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

  def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Write the vocabulary to disk, one token per line, ordered by token id.

    Args:
        save_directory (:obj:`str`):
            An existing directory (the vocabulary file name is appended to it), or
            otherwise a path that is used as the output file itself.
        filename_prefix (:obj:`str`, `optional`):
            If given, ``prefix + "-"`` is prepended to the file name (or to
            ``save_directory`` when it is not a directory).

    Returns:
        :obj:`Tuple[str]`: A one-element tuple holding the path that was written.
    """
    prefix = filename_prefix + "-" if filename_prefix else ""
    if os.path.isdir(save_directory):
      vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
    else:
      vocab_file = prefix + save_directory
    with open(vocab_file, "w", encoding="utf-8") as writer:
      # Only the order of the ids matters here: the file stores one token per
      # line, so gaps between ids are not represented in the output.
      for token, _ in sorted(self.vocab.items(), key=lambda kv: kv[1]):
        writer.write(token + "\n")
    return (vocab_file,)


class BasicTokenizer(object):
  """
  Rule-based pre-tokenizer: text clean-up, whitespace splitting, optional
  lower-casing / accent stripping, and punctuation splitting.

  Args:
      do_lower_case: Whether tokens are lower-cased.
      never_split: Tokens that must be passed through untouched.
      tokenize_chinese_chars: Whether CJK ideographs are isolated with spaces.
      strip_accents: Whether combining accents are removed. When left as
        ``None``, accents are removed exactly when ``do_lower_case`` is true.
  """

  def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
    self.do_lower_case = do_lower_case
    self.never_split = set() if never_split is None else set(never_split)
    self.tokenize_chinese_chars = tokenize_chinese_chars
    self.strip_accents = strip_accents

  def tokenize(self, text, never_split=None):
    """
    Split ``text`` into basic tokens.

    Args:
        text: The string to tokenize.
        never_split: Extra tokens to protect for this call only, in addition to
          the ones given at construction time.

    Returns:
        The list of tokens.
    """
    # union() builds a new set, so the instance-level set is left unchanged.
    protected = self.never_split
    if never_split:
      protected = protected.union(set(never_split))

    cleaned = self._clean_text(text)

    # Added on November 1st, 2018 for the multilingual and Chinese models. It is
    # applied to the English models as well, which is harmless: they were not
    # trained on Chinese data and essentially contain none (the vocabulary has a
    # few Chinese characters only because English Wikipedia includes some
    # Chinese words).
    if self.tokenize_chinese_chars:
      cleaned = self._tokenize_chinese_chars(cleaned)

    pieces = []
    for word in whitespace_tokenize(cleaned):
      if word not in protected:
        if self.do_lower_case:
          word = word.lower()
          # When lower-casing, accents are stripped unless explicitly disabled.
          remove_accents = self.strip_accents is not False
        else:
          remove_accents = bool(self.strip_accents)
        if remove_accents:
          word = self._run_strip_accents(word)
      pieces.extend(self._run_split_on_punc(word, protected))

    return whitespace_tokenize(" ".join(pieces))

  def _run_strip_accents(self, text):
    """Return ``text`` in NFD form with every non-spacing mark (category ``Mn``) removed."""
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

  def _run_split_on_punc(self, text, never_split=None):
    """
    Split ``text`` so that each punctuation character becomes its own piece.

    Args:
        text: The token to split.
        never_split: Optional collection of tokens returned unsplit.

    Returns:
        The list of pieces, in their original order.
    """
    if never_split is not None and text in never_split:
      return [text]

    groups = []
    open_new_group = True
    for ch in text:
      if _is_punctuation(ch):
        # Punctuation always stands alone and closes the current group.
        groups.append([ch])
        open_new_group = True
        continue
      if open_new_group:
        groups.append([])
        open_new_group = False
      groups[-1].append(ch)

    return ["".join(group) for group in groups]

  def _tokenize_chinese_chars(self, text):
    """Return ``text`` with a space inserted on both sides of every CJK ideograph."""
    return "".join(
      " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
      for ch in text
    )

  def _is_chinese_char(self, cp):
    """Return whether code point ``cp`` lies in one of the CJK ideograph ranges below."""
    # A "chinese character" here is anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Despite the name, that block does NOT cover all Japanese and Korean
    # characters: modern Korean Hangul is a separate block, and so are Japanese
    # Hiragana and Katakana. Those scripts write space-separated words, so they
    # get no special treatment and are handled like every other language.
    cjk_ranges = (
      (0x4E00, 0x9FFF),
      (0x3400, 0x4DBF),
      (0x20000, 0x2A6DF),
      (0x2A700, 0x2B73F),
      (0x2B740, 0x2B81F),
      (0x2B820, 0x2CEAF),
      (0xF900, 0xFAFF),
      (0x2F800, 0x2FA1F),
    )
    return any(low <= cp <= high for low, high in cjk_ranges)

  def _clean_text(self, text):
    """Drop NUL, U+FFFD and control characters; turn every whitespace character into a plain space."""
    kept = []
    for ch in text:
      if ord(ch) in (0, 0xFFFD) or _is_control(ch):
        continue
      kept.append(" " if _is_whitespace(ch) else ch)
    return "".join(kept)


class WordpieceTokenizer(object):
  """
  Greedy longest-match-first WordPiece tokenizer.

  Args:
      vocab: Container of known word pieces (only membership tests are used).
      unk_token: Token emitted for words that cannot be segmented.
      max_input_chars_per_word: Words longer than this are mapped to
        ``unk_token`` without attempting segmentation.
  """

  def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
    self.vocab = vocab
    self.unk_token = unk_token
    self.max_input_chars_per_word = max_input_chars_per_word

  def tokenize(self, text):
    """
    Split whitespace-separated words of ``text`` into word pieces.

    Pieces that do not start a word carry a ``##`` prefix. A word that is too
    long or cannot be fully covered by vocabulary entries becomes a single
    ``unk_token``.

    Args:
        text: The string to tokenize.

    Returns:
        The list of word pieces.
    """
    result = []
    for word in whitespace_tokenize(text):
      pieces = self._segment(word)
      if pieces is None:
        result.append(self.unk_token)
      else:
        result.extend(pieces)
    return result

  def _segment(self, word):
    """Return the word pieces of a single word, or ``None`` when it cannot be segmented."""
    symbols = list(word)
    total = len(symbols)
    if total > self.max_input_chars_per_word:
      return None

    pieces = []
    begin = 0
    while begin < total:
      # Try the longest remaining substring first and shrink from the right.
      for stop in range(total, begin, -1):
        candidate = "".join(symbols[begin:stop])
        if begin > 0:
          candidate = "##" + candidate
        if candidate in self.vocab:
          break
      else:
        # No prefix of the remainder is in the vocabulary.
        return None
      pieces.append(candidate)
      begin = stop
    return pieces