From fce75bad35281fd9eb782774a578b9143be1c2dc Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 23 Sep 2024 18:14:46 +0800 Subject: [PATCH 01/13] fix get_ppl --- lmdeploy/serve/utils.py | 42 +++++++++++------------------ lmdeploy/turbomind/deploy/config.py | 4 +++ 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 8945a71a7f..a651b40b50 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -180,42 +180,32 @@ def get_ppl(self, input_ids: Union[List[int], List[List[int]]]): input_ids (Union[List[int], List[List[int]]]): the batch of input token ids """ - assert len(input_ids) > 0 + assert isinstance(input_ids, List) and len(input_ids) > 0 if isinstance(input_ids[0], int): input_ids = [input_ids] - for input_id in input_ids: - assert len(input_id) > 1 + assert all(len(_) > 1 for _ in input_ids) - max_input_len = self.backend_config.max_prefill_token_num - n_max_iter = np.ceil( - max([len(input_id) - for input_id in input_ids]) / max_input_len).astype(int) + bs = len(input_ids) + max_seq_len = max([len(input_id) for input_id in input_ids]) - index_range_starts = [] - index_range_ends = [] - for input_id in input_ids: - index_range_start = np.array( - [i * max_input_len for i in range(n_max_iter)]) - index_range_end = index_range_start + max_input_len - index_range_start[index_range_start >= len(input_id)] = len( - input_id) - index_range_end[index_range_end >= len(input_id)] = len(input_id) - index_range_starts.append(index_range_start) - index_range_ends.append(index_range_end) + # TODO: a better way to determine `max_input_len` + # At most allocate 2G mem for logits with shape [bs, seq, vocab_size] + vocab_size = self.hf_tm_cfg.vocab_size + max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) generator = self.engine.create_instance() all_loss_matrix = [] all_target_mask = [] - for i in range(n_max_iter): - steps = [start[i] for start in index_range_starts] + for i in range(0, max_seq_len, max_input_len - 1): _input_ids = [ - input_id[start[i]:end[i]] for input_id, start, end in zip( - input_ids, index_range_starts, index_range_ends) + input_id[i:i + max_input_len] for input_id in input_ids ] - _logits = generator.decode(_input_ids, - steps=steps, - sequence_start=(i == 0), - sequence_end=(i == n_max_iter - 1)) + steps = [i] * bs + _logits = generator.decode( + _input_ids, + steps=steps, + sequence_start=(i == 0), + sequence_end=(i + max_input_len >= max_seq_len)) _logits = _logits.float().cpu() padding_token_id = -100 target_ids = [(x + [padding_token_id])[1:] for x in _input_ids] diff --git a/lmdeploy/turbomind/deploy/config.py b/lmdeploy/turbomind/deploy/config.py index bec6120b7b..4ee464e46d 100644 --- a/lmdeploy/turbomind/deploy/config.py +++ b/lmdeploy/turbomind/deploy/config.py @@ -135,5 +135,9 @@ def weight_type(self): def group_size(self): return self.model_config.group_size + @property + def vocab_size(self): + return self.model_config.vocab_size + def __str__(self): return json.dumps(self.to_dict(), indent=2) From a88c4c958a214f3a4d8b26cb6b048b972c3e1e97 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 23 Sep 2024 18:29:01 +0800 Subject: [PATCH 02/13] update --- lmdeploy/serve/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index a651b40b50..c7855b1bda 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -196,6 +196,13 @@ def get_ppl(self, input_ids: Union[List[int], List[List[int]]]): generator = 
self.engine.create_instance() all_loss_matrix = [] all_target_mask = [] + # suppose input_ids is [0,1,2,3,4,5,6,7,8], and max_input_len=5 + # In the first iter, tokens [0,1,2,3,4] are prefilled. + # loss=cross_entropy(logits[..., :-1, :], token_ids[1,2,3,4]) + # In the 2nd iter, token [4,5,6,7,8] should be prefilled. + # The first token must be the latest one in prev iter, because + # token_ids (or labels) have to be shifted the mostleft token + # loss=cross_entropy(logits[..., :-1, :], token_ids[5,6,7,8]) for i in range(0, max_seq_len, max_input_len - 1): _input_ids = [ input_id[i:i + max_input_len] for input_id in input_ids From 5c115350c0d1e8a2d630f403d93c89e920e085f4 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Mon, 23 Sep 2024 21:55:36 +0800 Subject: [PATCH 03/13] update --- lmdeploy/pytorch/engine/engine_instance.py | 99 ++++++++++++++- lmdeploy/serve/utils.py | 80 ++---------- lmdeploy/turbomind/turbomind.py | 136 ++++++++++----------- 3 files changed, 175 insertions(+), 140 deletions(-) diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index 0e8a1ff6b5..cc8b338881 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -1,5 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List +from typing import List, Union + +import torch +from torch.nn.utils.rnn import pad_sequence from lmdeploy.messages import EngineOutput, GenerationConfig from lmdeploy.utils import get_logger @@ -583,3 +586,97 @@ def __add_messages(session_ids, input_ids, adapter_names, self.end(sid) return ret + + def get_ppl(self, input_ids: Union[List[int], List[List[int]]]): + """Get perplexity scores given a list of input tokens. + + Args: + input_ids (Union[List[int], List[List[int]]]): the batch of + input token ids + """ + assert isinstance(input_ids, List) and len(input_ids) > 0 + if isinstance(input_ids[0], int): + input_ids = [input_ids] + assert all(len(_) > 1 for _ in input_ids) + + def get_logits(input_ids, + sequence_start, + sequence_end, + pre_iter_logits=None): + logits = self.decode(input_ids=input_ids, + sequence_start=sequence_start, + sequence_end=sequence_end) + logits = logits.float().cpu() + padding_token_id = -100 + if pre_iter_logits is None: + _logits = logits + target_ids = [(x + [padding_token_id])[1:] for x in input_ids] + target_ids = [ + torch.Tensor(torch.LongTensor(_target_ids)) + for _target_ids in target_ids + ] + else: + # concat the logit of the last token in previous prefill iter, + # and shift the logit of the last token in this iter + _logits = torch.concat((pre_iter_logits[..., -1:, :], logits), + dim=1) + _logits = _logits[..., :-1, :] + target_ids = [ + torch.Tensor(torch.LongTensor(_target_ids)) + for _target_ids in input_ids + ] + target_ids = pad_sequence(target_ids, + batch_first=True, + padding_value=padding_token_id) + target_ids = target_ids.to(logits.device) + target_mask = target_ids != padding_token_id + + # compute cross entropy loss + bsz, seq_len, vocab_size = logits.shape + flat_logits = _logits.contiguous().view(-1, vocab_size) + flat_target_ids = target_ids.contiguous().view(-1) + flat_loss_matrix = torch.nn.functional.cross_entropy( + flat_logits, + flat_target_ids, + reduction='none', + ignore_index=padding_token_id) + return logits, flat_loss_matrix.view(bsz, seq_len), target_mask + + bs = len(input_ids) + max_seq_len = max([len(input_id) for input_id in input_ids]) + + # TODO: a better way to determine `max_input_len` + # At most 
allocate 2G mem for logits with shape [bs, seq, vocab_size] + vocab_size = self.engine.model_config.vocab_size + max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) + + all_loss_matrix = [] + all_target_mask = [] + # the 1st prefill iter + _input_ids = [input_id[0:max_input_len] for input_id in input_ids] + logits, loss_matrix, target_mask = get_logits(input_ids=_input_ids, + sequence_start=True, + sequence_end=False) + all_loss_matrix.append(loss_matrix) + all_target_mask.append(target_mask) + + # the following prefill iters + for i in range(max_input_len, max_seq_len, max_input_len): + _input_ids = [ + input_id[i:i + max_input_len] for input_id in input_ids + ] + logits, loss_matrix, target_mask = get_logits( + input_ids=_input_ids, + sequence_start=False, + sequence_end=(i + max_input_len >= max_seq_len), + pre_iter_logits=logits) + all_loss_matrix.append(loss_matrix) + all_target_mask.append(target_mask) + + all_loss_matrix = torch.cat(all_loss_matrix, dim=1) + all_target_mask = torch.cat(all_target_mask, dim=1) + target_count = torch.sum(all_target_mask, dim=-1) + loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1) + loss_avg = loss_sum / target_count + loss_avg = loss_avg.cpu().numpy() + return loss_avg diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index c7855b1bda..ef4b28efd9 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -4,7 +4,6 @@ import numpy as np import torch -from torch.nn.utils.rnn import pad_sequence from lmdeploy.utils import get_logger @@ -173,76 +172,17 @@ def _split_embeddings(input_ids, niter, iter_len, embeddings, logits = torch.cat(logits, dim=1) return logits - def get_ppl(self, input_ids: Union[List[int], List[List[int]]]): - """Get perplexity scores given a list of input tokens. + def get_ppl(self, inputs: List[str]) -> List[float]: + """Get perplexity scores given a list of inputs. Args: - input_ids (Union[List[int], List[List[int]]]): the batch of - input token ids - """ - assert isinstance(input_ids, List) and len(input_ids) > 0 - if isinstance(input_ids[0], int): - input_ids = [input_ids] - assert all(len(_) > 1 for _ in input_ids) - - bs = len(input_ids) - max_seq_len = max([len(input_id) for input_id in input_ids]) - - # TODO: a better way to determine `max_input_len` - # At most allocate 2G mem for logits with shape [bs, seq, vocab_size] - vocab_size = self.hf_tm_cfg.vocab_size - max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) + inputs (List[str]): A list of strings. + Returns: + List[float]: A list of perplexity scores. + """ + if isinstance(inputs, str): + inputs = [inputs] + input_ids = [self.tokenizer.encode(text) for text in inputs] generator = self.engine.create_instance() - all_loss_matrix = [] - all_target_mask = [] - # suppose input_ids is [0,1,2,3,4,5,6,7,8], and max_input_len=5 - # In the first iter, tokens [0,1,2,3,4] are prefilled. - # loss=cross_entropy(logits[..., :-1, :], token_ids[1,2,3,4]) - # In the 2nd iter, token [4,5,6,7,8] should be prefilled. 
- # The first token must be the latest one in prev iter, because - # token_ids (or labels) have to be shifted the mostleft token - # loss=cross_entropy(logits[..., :-1, :], token_ids[5,6,7,8]) - for i in range(0, max_seq_len, max_input_len - 1): - _input_ids = [ - input_id[i:i + max_input_len] for input_id in input_ids - ] - steps = [i] * bs - _logits = generator.decode( - _input_ids, - steps=steps, - sequence_start=(i == 0), - sequence_end=(i + max_input_len >= max_seq_len)) - _logits = _logits.float().cpu() - padding_token_id = -100 - target_ids = [(x + [padding_token_id])[1:] for x in _input_ids] - target_ids = [ - torch.Tensor(torch.LongTensor(_target_ids)) - for _target_ids in target_ids - ] - target_ids = pad_sequence(target_ids, - batch_first=True, - padding_value=padding_token_id) - target_ids = target_ids.to(_logits.device) - target_mask = target_ids != padding_token_id - target_count = torch.sum(target_mask, dim=-1) - # compute cross entropy loss - bsz, seq_len, vocab_size = _logits.shape - flat_logits = _logits.contiguous().view(-1, vocab_size) - flat_target_ids = target_ids.contiguous().view(-1) - flat_loss_matrix = torch.nn.functional.cross_entropy( - flat_logits, - flat_target_ids, - reduction='none', - ignore_index=padding_token_id) - - all_loss_matrix.append(flat_loss_matrix.view(bsz, seq_len)) - all_target_mask.append(target_mask) - - all_loss_matrix = torch.cat(all_loss_matrix, dim=1) - all_target_mask = torch.cat(all_target_mask, dim=1) - target_count = torch.sum(all_target_mask, dim=-1) - loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1) - loss_avg = loss_sum / target_count - loss_avg = loss_avg.cpu().numpy() - return loss_avg + return generator.get_ppl(input_ids) diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index 8439c4a816..ab23649f72 100644 --- a/lmdeploy/turbomind/turbomind.py +++ b/lmdeploy/turbomind/turbomind.py @@ -314,7 +314,7 @@ def create_instance(self, cuda_stream_id=0): Returns: TurboMindInstance: an instance of turbomind """ - return TurboMindInstance(self, cuda_stream_id) + return TurboMindInstance(self, self.config, cuda_stream_id) class TurboMindInstance: @@ -325,7 +325,10 @@ class TurboMindInstance: cuda_stream_id(int): identity of a cuda stream """ - def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0): + def __init__(self, + tm_model: TurboMind, + config: TurbomindModelConfig, + cuda_stream_id: int = 0): self.tm_model = tm_model self.cuda_stream_id = cuda_stream_id @@ -343,6 +346,7 @@ def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0): self.que = Queue() self.executor: ThreadPoolExecutor = None self.future = None + self.config = config def _create_model_instance(self, device_id): rank = self.node_id * self.gpu_count + device_id @@ -903,78 +907,72 @@ def get_ppl(self, input_ids: Union[List[int], List[List[int]]]): """Get perplexity scores given a list of input tokens. 
Args: - input_ids (Union[List[int], List[List[int]]]): the batch of input token ids - """ # noqa 501 + input_ids (Union[List[int], List[List[int]]]): the batch of + input token ids + """ - if len(input_ids) == 0: - input_ids = [[]] + assert isinstance(input_ids, List) and len(input_ids) > 0 if isinstance(input_ids[0], int): input_ids = [input_ids] - - max_input_len = 16 * 1024 - # max_input_len = 16 - n_max_iter = np.ceil( - max([len(input_id) - for input_id in input_ids]) / max_input_len).astype(int) - - device = 'cpu' if n_max_iter > 1 else 'cuda' - - index_range_starts = [] - index_range_ends = [] - for input_id in input_ids: - index_range_start = np.array( - [i * max_input_len for i in range(n_max_iter)]) - index_range_end = index_range_start + max_input_len - index_range_start[index_range_start >= len(input_id)] = len( - input_id) - index_range_end[index_range_end >= len(input_id)] = len(input_id) - index_range_starts.append(index_range_start) - index_range_ends.append(index_range_end) - - logits = [] - for i in range(n_max_iter): - steps = [start[i] for start in index_range_starts] + assert all(len(_) > 1 for _ in input_ids) + + bs = len(input_ids) + max_seq_len = max([len(input_id) for input_id in input_ids]) + + # TODO: a better way to determine `max_input_len` + # At most allocate 2G mem for logits with shape [bs, seq, vocab_size] + vocab_size = self.config.vocab_size + max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) + + all_loss_matrix = [] + all_target_mask = [] + # suppose input_ids is [0,1,2,3,4,5,6,7,8], and max_input_len=5 + # In the first iter, tokens [0,1,2,3,4] are prefilled. + # loss=cross_entropy(logits[..., :-1, :], token_ids[1,2,3,4]) + # In the 2nd iter, token [4,5,6,7,8] should be prefilled. + # The first token must be the latest one in prev iter, because + # token_ids (or labels) have to be shifted the mostleft token + # loss=cross_entropy(logits[..., :-1, :], token_ids[5,6,7,8]) + for i in range(0, max_seq_len, max_input_len - 1): _input_ids = [ - input_id[start[i]:end[i]] for input_id, start, end in zip( - input_ids, index_range_starts, index_range_ends) + input_id[i:i + max_input_len] for input_id in input_ids + ] + steps = [i] * bs + _logits = self.decode( + _input_ids, + steps=steps, + sequence_start=(i == 0), + sequence_end=(i + max_input_len >= max_seq_len)) + _logits = _logits.float().cpu() + padding_token_id = -100 + target_ids = [(x + [padding_token_id])[1:] for x in _input_ids] + target_ids = [ + torch.Tensor(torch.LongTensor(_target_ids)) + for _target_ids in target_ids ] - _logits = self.decode(_input_ids, - steps, - sequence_start=(i == 0), - sequence_end=(i == n_max_iter - 1)) - _logits = _logits.to(device=device) - logits.append(_logits) - - # concat logits. 
Shape is [bsz, seq_len, vocab_size] - logits = torch.cat(logits, dim=1) - - # get target ids - padding_token_id = -100 - target_ids = [(_input_ids + [padding_token_id])[1:] - for _input_ids in input_ids] - target_ids = [ - torch.Tensor(torch.LongTensor(_target_ids)) - for _target_ids in target_ids - ] - target_ids = pad_sequence(target_ids, - batch_first=True, - padding_value=padding_token_id) - target_ids = target_ids.to(logits.device) - target_mask = target_ids != padding_token_id - target_count = torch.sum(target_mask, dim=-1) - - # compute cross entropy loss - bsz, seq_len, vocab_size = logits.shape - flat_logits = logits.contiguous().view(-1, vocab_size) - flat_target_ids = target_ids.contiguous().view(-1) - flat_loss_matrix = torch.nn.functional.cross_entropy( - flat_logits, - flat_target_ids, - reduction='none', - ignore_index=padding_token_id) - - loss_matrix = flat_loss_matrix.view(bsz, seq_len) - loss_sum = torch.sum(loss_matrix * target_mask, dim=1) + target_ids = pad_sequence(target_ids, + batch_first=True, + padding_value=padding_token_id) + target_ids = target_ids.to(_logits.device) + target_mask = target_ids != padding_token_id + target_count = torch.sum(target_mask, dim=-1) + # compute cross entropy loss + bsz, seq_len, vocab_size = _logits.shape + flat_logits = _logits.contiguous().view(-1, vocab_size) + flat_target_ids = target_ids.contiguous().view(-1) + flat_loss_matrix = torch.nn.functional.cross_entropy( + flat_logits, + flat_target_ids, + reduction='none', + ignore_index=padding_token_id) + + all_loss_matrix.append(flat_loss_matrix.view(bsz, seq_len)) + all_target_mask.append(target_mask) + + all_loss_matrix = torch.cat(all_loss_matrix, dim=1) + all_target_mask = torch.cat(all_target_mask, dim=1) + target_count = torch.sum(all_target_mask, dim=-1) + loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1) loss_avg = loss_sum / target_count loss_avg = loss_avg.cpu().numpy() return loss_avg From d5c5f39c8ae69be6062d5377b153d6083ac4d953 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 24 Sep 2024 11:25:40 +0800 Subject: [PATCH 04/13] remove get_ppl from engine.py --- lmdeploy/pytorch/engine/engine.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py index 3044635f22..660bfc0348 100644 --- a/lmdeploy/pytorch/engine/engine.py +++ b/lmdeploy/pytorch/engine/engine.py @@ -1046,34 +1046,3 @@ async def async_end(self, session_id: int): def end(self, session_id: int): """Add new session.""" return self.engine_instance.end(session_id) - - def decode(self, - input_ids, - input_embeddings: List[InputEmbeddingType] = None, - input_embedding_ranges: List[InputEmbeddingRangeType] = None, - steps: List[int] = None, - sequence_start: bool = True, - sequence_end: bool = True, - adapter_names: List[str] = None): - """Perform context decode on input tokens. - - Args: - input_ids (List[List[int]] | List[np.ndaray]): the batch of input - token ids - steps (List[int]): the offset of the k/v cache - input_embeddings (List[List[Union[torch.Tensor, np.ndarray]]]): - embeddings features - input_embedding_ranges: (List[List[Tuple[int, int]]]): - the begin/end offsets of input_embeddings to input_ids - sequence_start (bool): indicator for starting a sequence - sequence_end (bool): indicator for ending a sequence - adapter_names (List[str]): The name of the adapters. 
- """ - return self.engine_instance.decode( - input_ids, - input_embeddings=input_embeddings, - input_embedding_ranges=input_embedding_ranges, - steps=steps, - sequence_start=sequence_start, - sequence_end=sequence_end, - adapter_names=adapter_names) From db15e4197e671b88292d007c423659fe72aa3e08 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 24 Sep 2024 16:15:26 +0800 Subject: [PATCH 05/13] fix according to reviewer comments --- docs/en/advance/long_context.md | 8 +- docs/en/llm/pipeline.md | 16 ++-- docs/zh_cn/advance/long_context.md | 8 +- docs/zh_cn/llm/pipeline.md | 16 ++-- lmdeploy/pytorch/engine/engine_instance.py | 99 +--------------------- lmdeploy/serve/utils.py | 86 +++++++++++++++++-- lmdeploy/turbomind/turbomind.py | 76 +---------------- 7 files changed, 99 insertions(+), 210 deletions(-) diff --git a/docs/en/advance/long_context.md b/docs/en/advance/long_context.md index 6de6fb6e37..c32dd9352c 100644 --- a/docs/en/advance/long_context.md +++ b/docs/en/advance/long_context.md @@ -96,11 +96,9 @@ This test takes approximately 364 seconds per round when conducted on A100-80G G The following codes demonstrate how to use LMDeploy to calculate perplexity. ```python -from transformers import AutoTokenizer from lmdeploy import TurbomindEngineConfig, pipeline -import numpy as np -# load model and tokenizer +# build pipeline model_repoid_or_path = 'internlm/internlm2_5-7b-chat-1m' backend_config = TurbomindEngineConfig( rope_scaling_factor=2.5, @@ -109,11 +107,9 @@ backend_config = TurbomindEngineConfig( cache_max_entry_count=0.7, tp=4) pipe = pipeline(model_repoid_or_path, backend_config=backend_config) -tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True) # get perplexity text = 'Use a long prompt to replace this sentence' -input_ids = tokenizer.encode(text) -ppl = pipe.get_ppl(input_ids)[0] +ppl = pipe.get_ppl(text) print(ppl) ``` diff --git a/docs/en/llm/pipeline.md b/docs/en/llm/pipeline.md index 5570673576..788b2003dd 100644 --- a/docs/en/llm/pipeline.md +++ b/docs/en/llm/pipeline.md @@ -119,21 +119,21 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config): - **An example to cauculate logits & ppl:** ```python -from transformers import AutoTokenizer from lmdeploy import pipeline + model_repoid_or_path='internlm/internlm2_5-7b-chat' pipe = pipeline(model_repoid_or_path) -tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True) -# logits -messages = [ - {"role": "user", "content": "Hello, how are you?"}, +prompts = [ + "Hello, I am an AI assistant named InternLM. I am developed by Shanghai AI Laboratory", + "How to use LMDeploy to deploy a LLM model?" ] -input_ids = tokenizer.apply_chat_template(messages) -logits = pipe.get_logits(input_ids) + +# logits +logits = pipe.get_logits(prompts) # ppl -ppl = pipe.get_ppl(input_ids) +ppl = pipe.get_ppl(prompts) ``` - **Below is an example for pytorch backend. 
Please install triton first.** diff --git a/docs/zh_cn/advance/long_context.md b/docs/zh_cn/advance/long_context.md index 407ac607e5..df5fb904e6 100644 --- a/docs/zh_cn/advance/long_context.md +++ b/docs/zh_cn/advance/long_context.md @@ -96,11 +96,9 @@ passkey_retrieval(session_len, 5) 下面展示使用 LMDeploy 计算困惑度的用法 ```python -from transformers import AutoTokenizer from lmdeploy import TurbomindEngineConfig, pipeline -import numpy as np -# load model and tokenizer +# build pipeline model_repoid_or_path = 'internlm/internlm2_5-7b-chat-1m' backend_config = TurbomindEngineConfig( rope_scaling_factor=2.5, @@ -109,11 +107,9 @@ backend_config = TurbomindEngineConfig( cache_max_entry_count=0.7, tp=4) pipe = pipeline(model_repoid_or_path, backend_config=backend_config) -tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True) # get perplexity text = 'Use a long prompt to replace this sentence' -input_ids = tokenizer.encode(text) -loss = pipe.get_ppl(input_ids)[0] +loss = pipe.get_ppl(text) print(ppl) ``` diff --git a/docs/zh_cn/llm/pipeline.md b/docs/zh_cn/llm/pipeline.md index 7963c0cf55..170eff616a 100644 --- a/docs/zh_cn/llm/pipeline.md +++ b/docs/zh_cn/llm/pipeline.md @@ -119,21 +119,21 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config): - **计算 logits & ppl:** ```python -from transformers import AutoTokenizer from lmdeploy import pipeline + model_repoid_or_path='internlm/internlm2_5-7b-chat' pipe = pipeline(model_repoid_or_path) -tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True) -# logits -messages = [ - {"role": "user", "content": "Hello, how are you?"}, +prompts = [ + "Hello, I am an AI assistant named InternLM. I am developed by Shanghai AI Laboratory", + "How to use LMDeploy to deploy a LLM model?" ] -input_ids = tokenizer.apply_chat_template(messages) -logits = pipe.get_logits(input_ids) + +# logits +logits = pipe.get_logits(prompts) # ppl -ppl = pipe.get_ppl(input_ids) +ppl = pipe.get_ppl(prompts) ``` - **使用 pytorch 后端** diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py index cc8b338881..0e8a1ff6b5 100644 --- a/lmdeploy/pytorch/engine/engine_instance.py +++ b/lmdeploy/pytorch/engine/engine_instance.py @@ -1,8 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Union - -import torch -from torch.nn.utils.rnn import pad_sequence +from typing import List from lmdeploy.messages import EngineOutput, GenerationConfig from lmdeploy.utils import get_logger @@ -586,97 +583,3 @@ def __add_messages(session_ids, input_ids, adapter_names, self.end(sid) return ret - - def get_ppl(self, input_ids: Union[List[int], List[List[int]]]): - """Get perplexity scores given a list of input tokens. 
- - Args: - input_ids (Union[List[int], List[List[int]]]): the batch of - input token ids - """ - assert isinstance(input_ids, List) and len(input_ids) > 0 - if isinstance(input_ids[0], int): - input_ids = [input_ids] - assert all(len(_) > 1 for _ in input_ids) - - def get_logits(input_ids, - sequence_start, - sequence_end, - pre_iter_logits=None): - logits = self.decode(input_ids=input_ids, - sequence_start=sequence_start, - sequence_end=sequence_end) - logits = logits.float().cpu() - padding_token_id = -100 - if pre_iter_logits is None: - _logits = logits - target_ids = [(x + [padding_token_id])[1:] for x in input_ids] - target_ids = [ - torch.Tensor(torch.LongTensor(_target_ids)) - for _target_ids in target_ids - ] - else: - # concat the logit of the last token in previous prefill iter, - # and shift the logit of the last token in this iter - _logits = torch.concat((pre_iter_logits[..., -1:, :], logits), - dim=1) - _logits = _logits[..., :-1, :] - target_ids = [ - torch.Tensor(torch.LongTensor(_target_ids)) - for _target_ids in input_ids - ] - target_ids = pad_sequence(target_ids, - batch_first=True, - padding_value=padding_token_id) - target_ids = target_ids.to(logits.device) - target_mask = target_ids != padding_token_id - - # compute cross entropy loss - bsz, seq_len, vocab_size = logits.shape - flat_logits = _logits.contiguous().view(-1, vocab_size) - flat_target_ids = target_ids.contiguous().view(-1) - flat_loss_matrix = torch.nn.functional.cross_entropy( - flat_logits, - flat_target_ids, - reduction='none', - ignore_index=padding_token_id) - return logits, flat_loss_matrix.view(bsz, seq_len), target_mask - - bs = len(input_ids) - max_seq_len = max([len(input_id) for input_id in input_ids]) - - # TODO: a better way to determine `max_input_len` - # At most allocate 2G mem for logits with shape [bs, seq, vocab_size] - vocab_size = self.engine.model_config.vocab_size - max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) - - all_loss_matrix = [] - all_target_mask = [] - # the 1st prefill iter - _input_ids = [input_id[0:max_input_len] for input_id in input_ids] - logits, loss_matrix, target_mask = get_logits(input_ids=_input_ids, - sequence_start=True, - sequence_end=False) - all_loss_matrix.append(loss_matrix) - all_target_mask.append(target_mask) - - # the following prefill iters - for i in range(max_input_len, max_seq_len, max_input_len): - _input_ids = [ - input_id[i:i + max_input_len] for input_id in input_ids - ] - logits, loss_matrix, target_mask = get_logits( - input_ids=_input_ids, - sequence_start=False, - sequence_end=(i + max_input_len >= max_seq_len), - pre_iter_logits=logits) - all_loss_matrix.append(loss_matrix) - all_target_mask.append(target_mask) - - all_loss_matrix = torch.cat(all_loss_matrix, dim=1) - all_target_mask = torch.cat(all_target_mask, dim=1) - target_count = torch.sum(all_target_mask, dim=-1) - loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1) - loss_avg = loss_sum / target_count - loss_avg = loss_avg.cpu().numpy() - return loss_avg diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index ef4b28efd9..11ac5e56ea 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -4,6 +4,7 @@ import numpy as np import torch +from torch.nn.utils.rnn import pad_sequence from lmdeploy.utils import get_logger @@ -64,7 +65,7 @@ def prepare_inputs(self, prompts: Union[PromptType, List[PromptType]]): def get_logits( self, - input_ids: Union[InputIdsType, List[InputIdsType]], + inputs: Union[str, List[str]], input_embeddings: 
Union[InputEmbsType, List[InputEmbsType]] = None, input_embedding_ranges: Union[InputEmbRngsType, List[InputEmbRngsType]] = None): @@ -74,13 +75,17 @@ def get_logits( input_ids (Union[List[int], List[List[int]]]): the batch of input token ids """ - assert len(input_ids) > 0 - if isinstance(input_ids[0], int): - input_ids = [input_ids] - for input_id in input_ids: - assert len(input_id) > 0 + if isinstance(inputs, str): + inputs = [inputs] + assert all(len(_) > 0 for _ in inputs) + + input_ids = [self.tokenizer.encode(text) for text in inputs] + bs = len(input_ids) + # TODO: a better way to determine `max_input_len`, at most allocate + # 2G mem for logits with shape [bs, max_input_len, vocab_size] + vocab_size = self.hf_tm_cfg.vocab_size + max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) - max_input_len = self.backend_config.max_prefill_token_num n_max_iter = np.ceil( max([len(input_id) for input_id in input_ids]) / max_input_len).astype(int) @@ -183,6 +188,69 @@ def get_ppl(self, inputs: List[str]) -> List[float]: """ if isinstance(inputs, str): inputs = [inputs] - input_ids = [self.tokenizer.encode(text) for text in inputs] + assert all(len(_) > 0 for _ in inputs) + generator = self.engine.create_instance() - return generator.get_ppl(input_ids) + input_ids = [self.tokenizer.encode(text) for text in inputs] + + bs = len(input_ids) + max_seq_len = len(input_ids[0]) + + # TODO: a better way to determine `max_input_len`, at most allocate + # 2G mem for logits with shape [bs, max_input_len, vocab_size] + vocab_size = self.hf_tm_cfg.vocab_size + max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) + + all_loss_matrix = [] + all_target_mask = [] + for i in range(0, max_seq_len, max_input_len): + token_ids = [ + input_id[i:i + max_input_len] for input_id in input_ids + ] + steps = [i] * bs + logits = generator.decode( + token_ids, + steps=steps, + sequence_start=(i == 0), + sequence_end=(i + max_input_len >= max_seq_len)) + bsz, seq_len, vocab_size = logits.shape + logits = logits.float().cpu() + padding_token_id = -100 + # meaning logits[..., :, :] corresponds to labels + # token_ids[1:] + predict_token_id, which is + # input_ids[:, i+max_input_len:i+max_input_len+1] + target_ids = [ + input_id[i + 1:i + 1 + max_input_len] for input_id in input_ids + ] + if len(target_ids[0]) < len(token_ids[0]): + target_ids = [x + [padding_token_id] for x in target_ids] + target_ids = [ + torch.Tensor(torch.LongTensor(_target_ids)) + for _target_ids in target_ids + ] + target_ids = pad_sequence(target_ids, + batch_first=True, + padding_value=padding_token_id) + target_ids = target_ids.to(logits.device) + target_mask = target_ids != padding_token_id + + # compute cross entropy loss + flat_logits = logits.contiguous().view(-1, vocab_size) + flat_target_ids = target_ids.contiguous().view(-1) + flat_loss_matrix = torch.nn.functional.cross_entropy( + flat_logits, + flat_target_ids, + reduction='none', + ignore_index=padding_token_id) + + all_loss_matrix.append(flat_loss_matrix.view(bsz, seq_len)) + all_target_mask.append(target_mask) + + all_loss_matrix = torch.cat(all_loss_matrix, dim=1) + all_target_mask = torch.cat(all_target_mask, dim=1) + target_count = torch.sum(all_target_mask, dim=-1) + loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1) + loss_avg = loss_sum / target_count + loss_avg = loss_avg.cpu().numpy() + + return loss_avg diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py index ab23649f72..0e41849ccc 100644 --- a/lmdeploy/turbomind/turbomind.py +++ 
b/lmdeploy/turbomind/turbomind.py @@ -8,7 +8,7 @@ from dataclasses import asdict from itertools import repeat from queue import LifoQueue, Queue -from typing import Dict, Iterable, List, Union +from typing import Dict, Iterable, List import numpy as np import torch @@ -902,77 +902,3 @@ def _broadcast_np(data, dtype, shape=(batch_size, )): logits = outputs['logits'] return logits[:, :-1, :] - - def get_ppl(self, input_ids: Union[List[int], List[List[int]]]): - """Get perplexity scores given a list of input tokens. - - Args: - input_ids (Union[List[int], List[List[int]]]): the batch of - input token ids - """ - - assert isinstance(input_ids, List) and len(input_ids) > 0 - if isinstance(input_ids[0], int): - input_ids = [input_ids] - assert all(len(_) > 1 for _ in input_ids) - - bs = len(input_ids) - max_seq_len = max([len(input_id) for input_id in input_ids]) - - # TODO: a better way to determine `max_input_len` - # At most allocate 2G mem for logits with shape [bs, seq, vocab_size] - vocab_size = self.config.vocab_size - max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) - - all_loss_matrix = [] - all_target_mask = [] - # suppose input_ids is [0,1,2,3,4,5,6,7,8], and max_input_len=5 - # In the first iter, tokens [0,1,2,3,4] are prefilled. - # loss=cross_entropy(logits[..., :-1, :], token_ids[1,2,3,4]) - # In the 2nd iter, token [4,5,6,7,8] should be prefilled. - # The first token must be the latest one in prev iter, because - # token_ids (or labels) have to be shifted the mostleft token - # loss=cross_entropy(logits[..., :-1, :], token_ids[5,6,7,8]) - for i in range(0, max_seq_len, max_input_len - 1): - _input_ids = [ - input_id[i:i + max_input_len] for input_id in input_ids - ] - steps = [i] * bs - _logits = self.decode( - _input_ids, - steps=steps, - sequence_start=(i == 0), - sequence_end=(i + max_input_len >= max_seq_len)) - _logits = _logits.float().cpu() - padding_token_id = -100 - target_ids = [(x + [padding_token_id])[1:] for x in _input_ids] - target_ids = [ - torch.Tensor(torch.LongTensor(_target_ids)) - for _target_ids in target_ids - ] - target_ids = pad_sequence(target_ids, - batch_first=True, - padding_value=padding_token_id) - target_ids = target_ids.to(_logits.device) - target_mask = target_ids != padding_token_id - target_count = torch.sum(target_mask, dim=-1) - # compute cross entropy loss - bsz, seq_len, vocab_size = _logits.shape - flat_logits = _logits.contiguous().view(-1, vocab_size) - flat_target_ids = target_ids.contiguous().view(-1) - flat_loss_matrix = torch.nn.functional.cross_entropy( - flat_logits, - flat_target_ids, - reduction='none', - ignore_index=padding_token_id) - - all_loss_matrix.append(flat_loss_matrix.view(bsz, seq_len)) - all_target_mask.append(target_mask) - - all_loss_matrix = torch.cat(all_loss_matrix, dim=1) - all_target_mask = torch.cat(all_target_mask, dim=1) - target_count = torch.sum(all_target_mask, dim=-1) - loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1) - loss_avg = loss_sum / target_count - loss_avg = loss_avg.cpu().numpy() - return loss_avg From 95b7ef278864f05bbbf6e6dad2bb22e92279797b Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 24 Sep 2024 17:50:54 +0800 Subject: [PATCH 06/13] fix --- lmdeploy/serve/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 11ac5e56ea..a18d8c20fe 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -194,7 +194,7 @@ def get_ppl(self, inputs: List[str]) -> List[float]: input_ids = 
[self.tokenizer.encode(text) for text in inputs] bs = len(input_ids) - max_seq_len = len(input_ids[0]) + max_seq_len = max([len(_) for _ in input_ids]) # TODO: a better way to determine `max_input_len`, at most allocate # 2G mem for logits with shape [bs, max_input_len, vocab_size] @@ -222,8 +222,11 @@ def get_ppl(self, inputs: List[str]) -> List[float]: target_ids = [ input_id[i + 1:i + 1 + max_input_len] for input_id in input_ids ] - if len(target_ids[0]) < len(token_ids[0]): - target_ids = [x + [padding_token_id] for x in target_ids] + target_ids = [ + target_ids[i] + [padding_token_id] + if len(target_ids[i]) < len(token_ids[i]) else target_ids[i] + for i in range(bsz) + ] target_ids = [ torch.Tensor(torch.LongTensor(_target_ids)) for _target_ids in target_ids From e49b964734a4d7d9da2dfec5982fe47b39e3eeb6 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 24 Sep 2024 18:53:47 +0800 Subject: [PATCH 07/13] update --- lmdeploy/serve/utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index a18d8c20fe..58837bca6d 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -201,8 +201,8 @@ def get_ppl(self, inputs: List[str]) -> List[float]: vocab_size = self.hf_tm_cfg.vocab_size max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) - all_loss_matrix = [] - all_target_mask = [] + losses = [] + target_counts = [] for i in range(0, max_seq_len, max_input_len): token_ids = [ input_id[i:i + max_input_len] for input_id in input_ids @@ -245,14 +245,13 @@ def get_ppl(self, inputs: List[str]) -> List[float]: flat_target_ids, reduction='none', ignore_index=padding_token_id) + flat_loss_matrix = flat_loss_matrix.view(bsz, seq_len) + losses.append(flat_loss_matrix.sum(dim=-1).view(bsz, -1)) + target_counts.append(target_mask.sum(dim=-1).view(bsz, -1)) - all_loss_matrix.append(flat_loss_matrix.view(bsz, seq_len)) - all_target_mask.append(target_mask) + target_count = torch.concatenate(target_counts, dim=-1).sum(dim=-1) + loss_sum = torch.concatenate(losses, dim=-1).sum(dim=-1) - all_loss_matrix = torch.cat(all_loss_matrix, dim=1) - all_target_mask = torch.cat(all_target_mask, dim=1) - target_count = torch.sum(all_target_mask, dim=-1) - loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1) loss_avg = loss_sum / target_count loss_avg = loss_avg.cpu().numpy() From 28b0258086e61d0a440da04edd06fb0416262316 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 24 Sep 2024 18:58:58 +0800 Subject: [PATCH 08/13] keep logits.device unchanged --- lmdeploy/serve/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 58837bca6d..56d5bd5145 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -209,12 +209,12 @@ def get_ppl(self, inputs: List[str]) -> List[float]: ] steps = [i] * bs logits = generator.decode( - token_ids, + input_ids=token_ids, steps=steps, sequence_start=(i == 0), sequence_end=(i + max_input_len >= max_seq_len)) bsz, seq_len, vocab_size = logits.shape - logits = logits.float().cpu() + logits = logits.float() padding_token_id = -100 # meaning logits[..., :, :] corresponds to labels # token_ids[1:] + predict_token_id, which is From 4e7432dd559f80f60290c5050235b7014232ba14 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Tue, 24 Sep 2024 23:31:52 +0800 Subject: [PATCH 09/13] require input_ids have the same length --- lmdeploy/serve/utils.py | 52 ++++++++++++++++++++++++++++------------- 1 file changed, 36 
insertions(+), 16 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 56d5bd5145..438b6b25ac 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -65,7 +65,7 @@ def prepare_inputs(self, prompts: Union[PromptType, List[PromptType]]): def get_logits( self, - inputs: Union[str, List[str]], + input_ids: Union[InputIdsType, List[InputIdsType]], input_embeddings: Union[InputEmbsType, List[InputEmbsType]] = None, input_embedding_ranges: Union[InputEmbRngsType, List[InputEmbRngsType]] = None): @@ -75,11 +75,20 @@ def get_logits( input_ids (Union[List[int], List[List[int]]]): the batch of input token ids """ - if isinstance(inputs, str): - inputs = [inputs] - assert all(len(_) > 0 for _ in inputs) - input_ids = [self.tokenizer.encode(text) for text in inputs] + if isinstance(input_ids, InputIdsType): + input_ids = [input_ids] + if isinstance(input_embeddings, InputEmbsType): + input_embeddings = [input_embeddings] + if isinstance[input_embedding_ranges, InputEmbRngsType]: + input_embedding_ranges = [input_embedding_ranges] + + assert all(len(_) == len(input_ids[0]) for _ in input_ids), \ + 'the list of input_ids must have the same length' + + if input_embeddings is None or input_embedding_ranges is None: + assert input_embeddings is None and input_embedding_ranges is None + bs = len(input_ids) # TODO: a better way to determine `max_input_len`, at most allocate # 2G mem for logits with shape [bs, max_input_len, vocab_size] @@ -177,21 +186,30 @@ def _split_embeddings(input_ids, niter, iter_len, embeddings, logits = torch.cat(logits, dim=1) return logits - def get_ppl(self, inputs: List[str]) -> List[float]: - """Get perplexity scores given a list of inputs. + def get_ppl( + self, input_ids: Union[List[int], + List[List[int]]]) -> Union[float, List[float]]: + """Get perplexity scores given a list of input tokens that have to be + of the same length. Args: - inputs (List[str]): A list of strings. + input_ids (Union[List[int], List[List[int]]]): the batch of + input token ids Returns: - List[float]: A list of perplexity scores. + Union[float, List[float]]: A list of perplexity scores. 
""" - if isinstance(inputs, str): - inputs = [inputs] - assert all(len(_) > 0 for _ in inputs) + assert isinstance(input_ids, List) + if isinstance(input_ids[0], int): + input_ids = [input_ids] + + # In case of input_ids of unequal length, some list might be empty + # during multi-iter prefilling, which causes shape error if the + # inference engine is pytorch engine + assert all(len(_) == len(input_ids[0]) for _ in input_ids), \ + 'the list of input_ids must have the same length' generator = self.engine.create_instance() - input_ids = [self.tokenizer.encode(text) for text in inputs] bs = len(input_ids) max_seq_len = max([len(_) for _ in input_ids]) @@ -246,13 +264,15 @@ def get_ppl(self, inputs: List[str]) -> List[float]: reduction='none', ignore_index=padding_token_id) flat_loss_matrix = flat_loss_matrix.view(bsz, seq_len) - losses.append(flat_loss_matrix.sum(dim=-1).view(bsz, -1)) - target_counts.append(target_mask.sum(dim=-1).view(bsz, -1)) + loss = flat_loss_matrix.sum(dim=-1).cpu().view(bsz, -1) + target_count = target_mask.sum(dim=-1).cpu().view(bsz, -1) + losses.append(loss) + target_counts.append(target_count) target_count = torch.concatenate(target_counts, dim=-1).sum(dim=-1) loss_sum = torch.concatenate(losses, dim=-1).sum(dim=-1) loss_avg = loss_sum / target_count - loss_avg = loss_avg.cpu().numpy() + loss_avg = loss_avg.numpy() return loss_avg From a1a4845fc8ffdbe2459d8d4ca4b72bf3bfa143fa Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 25 Sep 2024 11:48:43 +0800 Subject: [PATCH 10/13] rollback user guide --- docs/en/advance/long_context.md | 8 ++++++-- docs/en/llm/pipeline.md | 16 ++++++++-------- docs/zh_cn/advance/long_context.md | 8 ++++++-- docs/zh_cn/llm/pipeline.md | 16 ++++++++-------- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/docs/en/advance/long_context.md b/docs/en/advance/long_context.md index c32dd9352c..6de6fb6e37 100644 --- a/docs/en/advance/long_context.md +++ b/docs/en/advance/long_context.md @@ -96,9 +96,11 @@ This test takes approximately 364 seconds per round when conducted on A100-80G G The following codes demonstrate how to use LMDeploy to calculate perplexity. ```python +from transformers import AutoTokenizer from lmdeploy import TurbomindEngineConfig, pipeline +import numpy as np -# build pipeline +# load model and tokenizer model_repoid_or_path = 'internlm/internlm2_5-7b-chat-1m' backend_config = TurbomindEngineConfig( rope_scaling_factor=2.5, @@ -107,9 +109,11 @@ backend_config = TurbomindEngineConfig( cache_max_entry_count=0.7, tp=4) pipe = pipeline(model_repoid_or_path, backend_config=backend_config) +tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True) # get perplexity text = 'Use a long prompt to replace this sentence' -ppl = pipe.get_ppl(text) +input_ids = tokenizer.encode(text) +ppl = pipe.get_ppl(input_ids)[0] print(ppl) ``` diff --git a/docs/en/llm/pipeline.md b/docs/en/llm/pipeline.md index 617ebc4f09..ab4035a8cc 100644 --- a/docs/en/llm/pipeline.md +++ b/docs/en/llm/pipeline.md @@ -119,21 +119,21 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config): - **An example to cauculate logits & ppl:** ```python +from transformers import AutoTokenizer from lmdeploy import pipeline - model_repoid_or_path='internlm/internlm2_5-7b-chat' pipe = pipeline(model_repoid_or_path) - -prompts = [ - "Hello, I am an AI assistant named InternLM. I am developed by Shanghai AI Laboratory", - "How to use LMDeploy to deploy a LLM model?" 
-] +tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True) # logits -logits = pipe.get_logits(prompts) +messages = [ + {"role": "user", "content": "Hello, how are you?"}, +] +input_ids = tokenizer.apply_chat_template(messages) +logits = pipe.get_logits(input_ids) # ppl -ppl = pipe.get_ppl(prompts) +ppl = pipe.get_ppl(input_ids) ``` - **Below is an example for pytorch backend. Please install triton first.** diff --git a/docs/zh_cn/advance/long_context.md b/docs/zh_cn/advance/long_context.md index df5fb904e6..407ac607e5 100644 --- a/docs/zh_cn/advance/long_context.md +++ b/docs/zh_cn/advance/long_context.md @@ -96,9 +96,11 @@ passkey_retrieval(session_len, 5) 下面展示使用 LMDeploy 计算困惑度的用法 ```python +from transformers import AutoTokenizer from lmdeploy import TurbomindEngineConfig, pipeline +import numpy as np -# build pipeline +# load model and tokenizer model_repoid_or_path = 'internlm/internlm2_5-7b-chat-1m' backend_config = TurbomindEngineConfig( rope_scaling_factor=2.5, @@ -107,9 +109,11 @@ backend_config = TurbomindEngineConfig( cache_max_entry_count=0.7, tp=4) pipe = pipeline(model_repoid_or_path, backend_config=backend_config) +tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True) # get perplexity text = 'Use a long prompt to replace this sentence' -loss = pipe.get_ppl(text) +input_ids = tokenizer.encode(text) +loss = pipe.get_ppl(input_ids)[0] print(ppl) ``` diff --git a/docs/zh_cn/llm/pipeline.md b/docs/zh_cn/llm/pipeline.md index 30c39093bd..a9c74a5f14 100644 --- a/docs/zh_cn/llm/pipeline.md +++ b/docs/zh_cn/llm/pipeline.md @@ -119,21 +119,21 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config): - **计算 logits & ppl:** ```python +from transformers import AutoTokenizer from lmdeploy import pipeline - model_repoid_or_path='internlm/internlm2_5-7b-chat' pipe = pipeline(model_repoid_or_path) - -prompts = [ - "Hello, I am an AI assistant named InternLM. I am developed by Shanghai AI Laboratory", - "How to use LMDeploy to deploy a LLM model?" 
-] +tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True) # logits -logits = pipe.get_logits(prompts) +messages = [ + {"role": "user", "content": "Hello, how are you?"}, +] +input_ids = tokenizer.apply_chat_template(messages) +logits = pipe.get_logits(input_ids) # ppl -ppl = pipe.get_ppl(prompts) +ppl = pipe.get_ppl(input_ids) ``` - **使用 pytorch 后端** From c64c00f917a03f438e969800fbadda7f2c59961d Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Wed, 25 Sep 2024 18:14:04 +0800 Subject: [PATCH 11/13] update --- lmdeploy/serve/utils.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 438b6b25ac..41633f38bf 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -75,19 +75,11 @@ def get_logits( input_ids (Union[List[int], List[List[int]]]): the batch of input token ids """ - - if isinstance(input_ids, InputIdsType): + assert len(input_ids) > 0 + if isinstance(input_ids[0], int): input_ids = [input_ids] - if isinstance(input_embeddings, InputEmbsType): - input_embeddings = [input_embeddings] - if isinstance[input_embedding_ranges, InputEmbRngsType]: - input_embedding_ranges = [input_embedding_ranges] - - assert all(len(_) == len(input_ids[0]) for _ in input_ids), \ - 'the list of input_ids must have the same length' - - if input_embeddings is None or input_embedding_ranges is None: - assert input_embeddings is None and input_embedding_ranges is None + for input_id in input_ids: + assert len(input_id) > 0 bs = len(input_ids) # TODO: a better way to determine `max_input_len`, at most allocate @@ -203,12 +195,6 @@ def get_ppl( if isinstance(input_ids[0], int): input_ids = [input_ids] - # In case of input_ids of unequal length, some list might be empty - # during multi-iter prefilling, which causes shape error if the - # inference engine is pytorch engine - assert all(len(_) == len(input_ids[0]) for _ in input_ids), \ - 'the list of input_ids must have the same length' - generator = self.engine.create_instance() bs = len(input_ids) From 4066eb52a67e61eab1b82a60c76ba649f9b38973 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 26 Sep 2024 00:13:55 +0800 Subject: [PATCH 12/13] split batch dim --- lmdeploy/serve/utils.py | 173 +++++++++++++++++++++++++++------------- 1 file changed, 116 insertions(+), 57 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 41633f38bf..44476a958b 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -178,9 +178,8 @@ def _split_embeddings(input_ids, niter, iter_len, embeddings, logits = torch.cat(logits, dim=1) return logits - def get_ppl( - self, input_ids: Union[List[int], - List[List[int]]]) -> Union[float, List[float]]: + def get_ppl(self, input_ids: Union[List[int], + List[List[int]]]) -> List[float]: """Get perplexity scores given a list of input tokens that have to be of the same length. 
@@ -197,68 +196,128 @@ def get_ppl( generator = self.engine.create_instance() - bs = len(input_ids) - max_seq_len = max([len(_) for _ in input_ids]) - # TODO: a better way to determine `max_input_len`, at most allocate # 2G mem for logits with shape [bs, max_input_len, vocab_size] vocab_size = self.hf_tm_cfg.vocab_size - max_input_len = 2 * 1024**3 // (bs * vocab_size * 4) + max_input_len = 2 * 1024**3 // (vocab_size * 4) + sizes = [len(_) for _ in input_ids] + losses = [] + target_counts = [] + for (start, end) in self._batch_iterator(sizes, max_input_len): + if start == end: + loss, target_count = self._get_long_text_ppl( + generator=generator, + input_ids=input_ids[start], + max_input_len=max_input_len) + losses.append(loss) + target_counts.append(target_count) + else: + loss, target_count = self._get_ppl( + generator=generator, + input_ids=input_ids[start:end], + max_input_len=max_input_len, + ) + losses.append(loss) + target_counts.append(target_count) + loss = torch.concatenate(losses) + target_count = torch.concatenate(target_counts) + loss_avg = loss / target_count + loss_avg = loss_avg.numpy() + return loss_avg + def _batch_iterator(self, sizes, max_length): + i = 0 + while i < len(sizes): + current_sum = 0 + start_index = i + + while i < len(sizes) and current_sum + sizes[i] <= max_length: + current_sum += sizes[i] + i += 1 + + yield (start_index, i) + if i > start_index: + continue + else: + i += 1 + + def _get_long_text_ppl(self, generator, input_ids, max_input_len): + assert all(isinstance(_, int) for _ in input_ids) + assert len(input_ids) > max_input_len + + seq_len = len(input_ids) losses = [] target_counts = [] - for i in range(0, max_seq_len, max_input_len): - token_ids = [ - input_id[i:i + max_input_len] for input_id in input_ids - ] - steps = [i] * bs - logits = generator.decode( - input_ids=token_ids, - steps=steps, + for i in range(0, len(input_ids), max_input_len): + token_ids = input_ids[i:i + max_input_len] + step = i + # shift token_ids by 1 to the left + target_ids = input_ids[i + 1:i + 1 + max_input_len] + + loss, target_count = self._get_ppl( + generator=generator, + input_ids=[token_ids], + max_input_len=max_input_len, + target_ids=[target_ids], + steps=[step], sequence_start=(i == 0), - sequence_end=(i + max_input_len >= max_seq_len)) - bsz, seq_len, vocab_size = logits.shape - logits = logits.float() - padding_token_id = -100 - # meaning logits[..., :, :] corresponds to labels - # token_ids[1:] + predict_token_id, which is - # input_ids[:, i+max_input_len:i+max_input_len+1] - target_ids = [ - input_id[i + 1:i + 1 + max_input_len] for input_id in input_ids - ] + sequence_end=(i + max_input_len >= seq_len)) + losses.append(loss) + target_counts.append(target_count) + loss_sum = torch.concatenate(losses).sum().unsqueeze(0) + target_count = torch.concatenate(target_counts).sum().unsqueeze(0) + return loss_sum, target_count + + def _get_ppl(self, + generator, + input_ids, + max_input_len, + target_ids=None, + steps=None, + sequence_start: bool = True, + sequence_end: bool = True): + assert isinstance(input_ids, List) + assert all(isinstance(_, List) for _ in input_ids) + assert sum(len(_) for _ in input_ids) <= max_input_len + if target_ids: + assert all(isinstance(_, List) for _ in target_ids) + + logger.info(f'get_ppl batch_size {len(input_ids)}') + logits = generator.decode(input_ids=input_ids, + steps=steps, + sequence_start=sequence_start, + sequence_end=sequence_end) + bsz, seq_len, vocab_size = logits.shape + logits = logits.float() + padding_token_id = 
-100 + if target_ids is None: + # shift token_ids by 1 to the left + target_ids = [x[1:] + [padding_token_id] for x in input_ids] + else: target_ids = [ target_ids[i] + [padding_token_id] - if len(target_ids[i]) < len(token_ids[i]) else target_ids[i] + if len(target_ids[i]) < len(input_ids[i]) else target_ids[i] for i in range(bsz) ] - target_ids = [ - torch.Tensor(torch.LongTensor(_target_ids)) - for _target_ids in target_ids - ] - target_ids = pad_sequence(target_ids, - batch_first=True, - padding_value=padding_token_id) - target_ids = target_ids.to(logits.device) - target_mask = target_ids != padding_token_id - - # compute cross entropy loss - flat_logits = logits.contiguous().view(-1, vocab_size) - flat_target_ids = target_ids.contiguous().view(-1) - flat_loss_matrix = torch.nn.functional.cross_entropy( - flat_logits, - flat_target_ids, - reduction='none', - ignore_index=padding_token_id) - flat_loss_matrix = flat_loss_matrix.view(bsz, seq_len) - loss = flat_loss_matrix.sum(dim=-1).cpu().view(bsz, -1) - target_count = target_mask.sum(dim=-1).cpu().view(bsz, -1) - losses.append(loss) - target_counts.append(target_count) - - target_count = torch.concatenate(target_counts, dim=-1).sum(dim=-1) - loss_sum = torch.concatenate(losses, dim=-1).sum(dim=-1) - - loss_avg = loss_sum / target_count - loss_avg = loss_avg.numpy() - - return loss_avg + target_ids = [ + torch.Tensor(torch.LongTensor(_target_ids)) + for _target_ids in target_ids + ] + target_ids = pad_sequence(target_ids, + batch_first=True, + padding_value=padding_token_id) + target_ids = target_ids.to(logits.device) + target_mask = target_ids != padding_token_id + + # compute cross entropy loss + flat_logits = logits.contiguous().view(-1, vocab_size) + flat_target_ids = target_ids.contiguous().view(-1) + flat_loss_matrix = torch.nn.functional.cross_entropy( + flat_logits, + flat_target_ids, + reduction='none', + ignore_index=padding_token_id) + flat_loss_matrix = flat_loss_matrix.view(bsz, seq_len) + loss = flat_loss_matrix.sum(dim=-1).cpu() + target_count = target_mask.sum(dim=-1).cpu() + return loss, target_count From c7607943f31376c14d06fe66fe7794dad19b2a71 Mon Sep 17 00:00:00 2001 From: lvhan028 Date: Thu, 26 Sep 2024 14:10:25 +0800 Subject: [PATCH 13/13] apply torch.cuda.empty_cache() --- lmdeploy/serve/utils.py | 65 ++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/lmdeploy/serve/utils.py b/lmdeploy/serve/utils.py index 44476a958b..4791d3c724 100644 --- a/lmdeploy/serve/utils.py +++ b/lmdeploy/serve/utils.py @@ -203,18 +203,27 @@ def get_ppl(self, input_ids: Union[List[int], sizes = [len(_) for _ in input_ids] losses = [] target_counts = [] + sorted_index_values = sorted(list(enumerate(sizes)), + key=lambda x: x[1], + reverse=True) + sizes = [value for index, value in sorted_index_values] + indices = [index for index, value in sorted_index_values] + logger.info(f'sorted sizes: {sizes}') + logger.info(f'sorted indices: {indices}') for (start, end) in self._batch_iterator(sizes, max_input_len): + logger.info(f'start: {start}, end: {end}') + _input_ids = [input_ids[indices[i]] for i in range(start, end)] if start == end: loss, target_count = self._get_long_text_ppl( generator=generator, - input_ids=input_ids[start], + input_ids=_input_ids, max_input_len=max_input_len) losses.append(loss) target_counts.append(target_count) else: loss, target_count = self._get_ppl( generator=generator, - input_ids=input_ids[start:end], + input_ids=_input_ids, max_input_len=max_input_len, ) 
losses.append(loss) @@ -222,17 +231,27 @@ def get_ppl(self, input_ids: Union[List[int], loss = torch.concatenate(losses) target_count = torch.concatenate(target_counts) loss_avg = loss / target_count - loss_avg = loss_avg.numpy() - return loss_avg - - def _batch_iterator(self, sizes, max_length): + loss_avg = loss_avg.numpy().tolist() + result = list(range(len(loss_avg))) + for index, sorted_index in enumerate(indices): + result[sorted_index] = loss_avg[index] + return result + + def _batch_iterator(self, sizes, max_value): + """Return an iterator that calculates intervals (start, end) of a + descend-order list, in which the sum of values in the range is the + maximum number not less than max_value. By "the sum of values", + + here it means $$len(sizes[start:end]) * sizes[start]$$ + """ i = 0 while i < len(sizes): current_sum = 0 start_index = i - while i < len(sizes) and current_sum + sizes[i] <= max_length: - current_sum += sizes[i] + while i < len( + sizes) and current_sum + sizes[start_index] <= max_value: + current_sum += sizes[start_index] i += 1 yield (start_index, i) @@ -242,24 +261,25 @@ def _batch_iterator(self, sizes, max_length): i += 1 def _get_long_text_ppl(self, generator, input_ids, max_input_len): - assert all(isinstance(_, int) for _ in input_ids) - assert len(input_ids) > max_input_len + assert isinstance(input_ids, List) and len(input_ids) == 1 + seq_len = len(input_ids[0]) + assert seq_len > max_input_len + logger.info(f'get long text ppl: seq_len {seq_len}') - seq_len = len(input_ids) losses = [] target_counts = [] - for i in range(0, len(input_ids), max_input_len): - token_ids = input_ids[i:i + max_input_len] - step = i + for i in range(0, seq_len, max_input_len): + token_ids = input_ids[:, i:i + max_input_len] + step = [i] # shift token_ids by 1 to the left - target_ids = input_ids[i + 1:i + 1 + max_input_len] + target_ids = input_ids[:, i + 1:i + 1 + max_input_len] loss, target_count = self._get_ppl( generator=generator, - input_ids=[token_ids], + input_ids=token_ids, max_input_len=max_input_len, - target_ids=[target_ids], - steps=[step], + target_ids=target_ids, + steps=step, sequence_start=(i == 0), sequence_end=(i + max_input_len >= seq_len)) losses.append(loss) @@ -278,11 +298,16 @@ def _get_ppl(self, sequence_end: bool = True): assert isinstance(input_ids, List) assert all(isinstance(_, List) for _ in input_ids) - assert sum(len(_) for _ in input_ids) <= max_input_len if target_ids: assert all(isinstance(_, List) for _ in target_ids) - logger.info(f'get_ppl batch_size {len(input_ids)}') + lens = [len(_) for _ in input_ids] + total_len = sum(lens) + assert sum(lens) <= max_input_len + + logger.info(f'get_ppl: bs: {len(input_ids)}, lens: {lens}, ' + f'total_len: {total_len}') + torch.cuda.empty_cache() logits = generator.decode(input_ids=input_ids, steps=steps, sequence_start=sequence_start,
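
Below is a minimal, self-contained sketch of the chunk-and-shift perplexity computation that this patch series converges on. It is illustrative only and not part of the patches: `chunked_ppl`, `logits_fn`, and `toy_logits` are hypothetical names, and `logits_fn` stands in for the engine call `generator.decode(input_ids, steps=..., sequence_start=..., sequence_end=...)`, which in lmdeploy supplies the per-chunk logits from the inference engine with the KV cache carrying the earlier context.

```python
# Illustrative sketch, not part of the patches. `logits_fn` is a hypothetical
# stand-in for the engine call `generator.decode(...)` used in the series.
from typing import Callable, List

import torch


def chunked_ppl(input_ids: List[int],
                logits_fn: Callable[[List[int], int], torch.Tensor],
                max_input_len: int,
                padding_token_id: int = -100) -> float:
    """Average negative log-likelihood per token of one long sequence.

    The sequence is prefilled in chunks of at most `max_input_len` tokens.
    For the chunk starting at offset i, the labels are the input ids
    shifted one position to the left, i.e. input_ids[i+1:i+1+len(chunk)];
    the last position of the final chunk has no label and is masked out
    with `padding_token_id`.
    """
    seq_len = len(input_ids)
    loss_sum, target_count = 0.0, 0
    for i in range(0, seq_len, max_input_len):
        chunk = input_ids[i:i + max_input_len]
        # [1, len(chunk), vocab_size]; the offset plays the role of `steps`
        logits = logits_fn(chunk, i).float()
        targets = input_ids[i + 1:i + 1 + len(chunk)]
        if len(targets) < len(chunk):  # final chunk: pad the missing label
            targets = targets + [padding_token_id]
        target_ids = torch.tensor(targets, dtype=torch.long)
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.shape[-1]),
            target_ids,
            reduction='none',
            ignore_index=padding_token_id)
        mask = target_ids != padding_token_id
        # accumulate the loss sum and label count separately so the final
        # average weights every labelled token equally across chunks
        loss_sum += loss[mask].sum().item()
        target_count += int(mask.sum())
    return loss_sum / target_count


if __name__ == '__main__':
    # Toy demo with a fixed random lookup table acting as the "model",
    # so the example runs end to end without an inference engine.
    vocab_size = 32
    torch.manual_seed(0)
    table = torch.randn(vocab_size, vocab_size)

    def toy_logits(chunk: List[int], step: int) -> torch.Tensor:
        return table[torch.tensor(chunk)].unsqueeze(0)

    ids = torch.randint(0, vocab_size, (23, )).tolist()
    print(chunked_ppl(ids, toy_logits, max_input_len=5))
```

Carrying per-chunk loss sums and label counts across prefill iterations, rather than per-chunk averages, keeps the result equal to a single whole-sequence average of per-token losses, which is why the patches accumulate `losses` and `target_counts` before dividing.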