The get_ppl missed the last token of each iteration during multi-iter prefill #2499

Merged · 14 commits · Sep 26, 2024
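
In short: when a prompt is longer than the per-iteration chunk size, `get_ppl` prefills it over several iterations. The label for the last position of each chunk is the first token of the *next* chunk, but the old per-chunk target construction (`(x + [padding_token_id])[1:]`) padded the tail of every chunk, so one real token per intermediate iteration was silently dropped from the loss. A minimal standalone sketch of the two target layouts (toy token ids, not LMDeploy code):

```python
PAD = -100  # padding label id, ignored by the loss

def chunk_targets_old(input_ids, chunk):
    """Old behaviour: targets are built per chunk, so the last token of
    every chunk (not just the final one) is replaced by PAD and dropped."""
    out = []
    for i in range(0, len(input_ids), chunk):
        piece = input_ids[i:i + chunk]
        out.append((piece + [PAD])[1:])
    return out

def chunk_targets_new(input_ids, chunk):
    """Fixed behaviour: the label of a chunk's last position is the first
    token of the next chunk; only the end of the sequence is padded."""
    out = []
    for i in range(0, len(input_ids), chunk):
        tgt = input_ids[i + 1:i + 1 + chunk]
        if len(tgt) < len(input_ids[i:i + chunk]):
            tgt = tgt + [PAD]
        out.append(tgt)
    return out

ids = [1, 2, 3, 4, 5, 6, 7]
print(chunk_targets_old(ids, 3))  # [[2, 3, -100], [5, 6, -100], [-100]]
print(chunk_targets_new(ids, 3))  # [[2, 3, 4], [5, 6, 7], [-100]]
```

The rewritten loop in `lmdeploy/serve/utils.py` below implements the second layout.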
8 changes: 2 additions & 6 deletions docs/en/advance/long_context.md
@@ -96,11 +96,9 @@ This test takes approximately 364 seconds per round when conducted on A100-80G G
The following codes demonstrate how to use LMDeploy to calculate perplexity.

```python
from transformers import AutoTokenizer
from lmdeploy import TurbomindEngineConfig, pipeline
import numpy as np

# load model and tokenizer
# build pipeline
model_repoid_or_path = 'internlm/internlm2_5-7b-chat-1m'
backend_config = TurbomindEngineConfig(
rope_scaling_factor=2.5,
@@ -109,11 +107,9 @@ backend_config = TurbomindEngineConfig(
cache_max_entry_count=0.7,
tp=4)
pipe = pipeline(model_repoid_or_path, backend_config=backend_config)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

# get perplexity
text = 'Use a long prompt to replace this sentence'
input_ids = tokenizer.encode(text)
ppl = pipe.get_ppl(input_ids)[0]
ppl = pipe.get_ppl(text)
print(ppl)
```
16 changes: 8 additions & 8 deletions docs/en/llm/pipeline.md
@@ -119,21 +119,21 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config):
- **An example to calculate logits & ppl:**

```python
from transformers import AutoTokenizer
from lmdeploy import pipeline

model_repoid_or_path='internlm/internlm2_5-7b-chat'
pipe = pipeline(model_repoid_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

# logits
messages = [
{"role": "user", "content": "Hello, how are you?"},
prompts = [
"Hello, I am an AI assistant named InternLM. I am developed by Shanghai AI Laboratory",
"How to use LMDeploy to deploy a LLM model?"
]
input_ids = tokenizer.apply_chat_template(messages)
logits = pipe.get_logits(input_ids)

# logits
logits = pipe.get_logits(prompts)

# ppl
ppl = pipe.get_ppl(input_ids)
ppl = pipe.get_ppl(prompts)
```

- **Below is an example for pytorch backend. Please install triton first.**
8 changes: 2 additions & 6 deletions docs/zh_cn/advance/long_context.md
@@ -96,11 +96,9 @@ passkey_retrieval(session_len, 5)
The following demonstrates how to use LMDeploy to calculate perplexity.

```python
from transformers import AutoTokenizer
from lmdeploy import TurbomindEngineConfig, pipeline
import numpy as np

# load model and tokenizer
# build pipeline
model_repoid_or_path = 'internlm/internlm2_5-7b-chat-1m'
backend_config = TurbomindEngineConfig(
rope_scaling_factor=2.5,
@@ -109,11 +107,9 @@ backend_config = TurbomindEngineConfig(
cache_max_entry_count=0.7,
tp=4)
pipe = pipeline(model_repoid_or_path, backend_config=backend_config)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

# get perplexity
text = 'Use a long prompt to replace this sentence'
input_ids = tokenizer.encode(text)
loss = pipe.get_ppl(input_ids)[0]
ppl = pipe.get_ppl(text)
print(ppl)
```
16 changes: 8 additions & 8 deletions docs/zh_cn/llm/pipeline.md
@@ -119,21 +119,21 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config):
- **Calculating logits & ppl:**

```python
from transformers import AutoTokenizer
from lmdeploy import pipeline

model_repoid_or_path='internlm/internlm2_5-7b-chat'
pipe = pipeline(model_repoid_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

# logits
messages = [
{"role": "user", "content": "Hello, how are you?"},
prompts = [
"Hello, I am an AI assistant named InternLM. I am developed by Shanghai AI Laboratory",
"How to use LMDeploy to deploy a LLM model?"
]
input_ids = tokenizer.apply_chat_template(messages)
logits = pipe.get_logits(input_ids)

# logits
logits = pipe.get_logits(prompts)

# ppl
ppl = pipe.get_ppl(input_ids)
ppl = pipe.get_ppl(prompts)
```

- **Using the pytorch backend**
31 changes: 0 additions & 31 deletions lmdeploy/pytorch/engine/engine.py
@@ -1046,34 +1046,3 @@ async def async_end(self, session_id: int):
def end(self, session_id: int):
"""Add new session."""
return self.engine_instance.end(session_id)

def decode(self,
input_ids,
input_embeddings: List[InputEmbeddingType] = None,
input_embedding_ranges: List[InputEmbeddingRangeType] = None,
steps: List[int] = None,
sequence_start: bool = True,
sequence_end: bool = True,
adapter_names: List[str] = None):
"""Perform context decode on input tokens.

Args:
input_ids (List[List[int]] | List[np.ndaray]): the batch of input
token ids
steps (List[int]): the offset of the k/v cache
input_embeddings (List[List[Union[torch.Tensor, np.ndarray]]]):
embeddings features
input_embedding_ranges: (List[List[Tuple[int, int]]]):
the begin/end offsets of input_embeddings to input_ids
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
adapter_names (List[str]): The name of the adapters.
"""
return self.engine_instance.decode(
input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
steps=steps,
sequence_start=sequence_start,
sequence_end=sequence_end,
adapter_names=adapter_names)
99 changes: 52 additions & 47 deletions lmdeploy/serve/utils.py
@@ -65,7 +65,7 @@ def prepare_inputs(self, prompts: Union[PromptType, List[PromptType]]):

def get_logits(
self,
input_ids: Union[InputIdsType, List[InputIdsType]],
inputs: Union[str, List[str]],
input_embeddings: Union[InputEmbsType, List[InputEmbsType]] = None,
input_embedding_ranges: Union[InputEmbRngsType,
List[InputEmbRngsType]] = None):
@@ -75,13 +75,17 @@ def get_logits(
inputs (Union[str, List[str]]): the batch of input texts
"""
assert len(input_ids) > 0
if isinstance(input_ids[0], int):
input_ids = [input_ids]
for input_id in input_ids:
assert len(input_id) > 0
if isinstance(inputs, str):
inputs = [inputs]
assert all(len(_) > 0 for _ in inputs)

input_ids = [self.tokenizer.encode(text) for text in inputs]
bs = len(input_ids)
# TODO: a better way to determine `max_input_len`, at most allocate
# 2G mem for logits with shape [bs, max_input_len, vocab_size]
vocab_size = self.hf_tm_cfg.vocab_size
max_input_len = 2 * 1024**3 // (bs * vocab_size * 4)

max_input_len = self.backend_config.max_prefill_token_num
n_max_iter = np.ceil(
max([len(input_id)
for input_id in input_ids]) / max_input_len).astype(int)
@@ -173,65 +177,65 @@ def _split_embeddings(input_ids, niter, iter_len, embeddings,
logits = torch.cat(logits, dim=1)
return logits

def get_ppl(self, input_ids: Union[List[int], List[List[int]]]):
"""Get perplexity scores given a list of input tokens.
def get_ppl(self, inputs: List[str]) -> List[float]:
"""Get perplexity scores given a list of inputs.

Args:
input_ids (Union[List[int], List[List[int]]]): the batch of
input token ids
inputs (List[str]): A list of strings.

Returns:
List[float]: A list of perplexity scores.
"""
assert len(input_ids) > 0
if isinstance(input_ids[0], int):
input_ids = [input_ids]
for input_id in input_ids:
assert len(input_id) > 1
if isinstance(inputs, str):
inputs = [inputs]
assert all(len(_) > 0 for _ in inputs)

max_input_len = self.backend_config.max_prefill_token_num
n_max_iter = np.ceil(
max([len(input_id)
for input_id in input_ids]) / max_input_len).astype(int)
generator = self.engine.create_instance()
input_ids = [self.tokenizer.encode(text) for text in inputs]

index_range_starts = []
index_range_ends = []
for input_id in input_ids:
index_range_start = np.array(
[i * max_input_len for i in range(n_max_iter)])
index_range_end = index_range_start + max_input_len
index_range_start[index_range_start >= len(input_id)] = len(
input_id)
index_range_end[index_range_end >= len(input_id)] = len(input_id)
index_range_starts.append(index_range_start)
index_range_ends.append(index_range_end)
bs = len(input_ids)
max_seq_len = len(input_ids[0])

# TODO: a better way to determine `max_input_len`, at most allocate
# 2G mem for logits with shape [bs, max_input_len, vocab_size]
vocab_size = self.hf_tm_cfg.vocab_size
max_input_len = 2 * 1024**3 // (bs * vocab_size * 4)

generator = self.engine.create_instance()
all_loss_matrix = []
all_target_mask = []
for i in range(n_max_iter):
steps = [start[i] for start in index_range_starts]
_input_ids = [
input_id[start[i]:end[i]] for input_id, start, end in zip(
input_ids, index_range_starts, index_range_ends)
for i in range(0, max_seq_len, max_input_len):
token_ids = [
input_id[i:i + max_input_len] for input_id in input_ids
]
_logits = generator.decode(_input_ids,
steps=steps,
sequence_start=(i == 0),
sequence_end=(i == n_max_iter - 1))
_logits = _logits.float().cpu()
steps = [i] * bs
logits = generator.decode(
token_ids,
steps=steps,
sequence_start=(i == 0),
sequence_end=(i + max_input_len >= max_seq_len))
bsz, seq_len, vocab_size = logits.shape
logits = logits.float().cpu()
padding_token_id = -100
target_ids = [(x + [padding_token_id])[1:] for x in _input_ids]
# logits at position j predict token j+1, so the labels for this chunk
# are input_id[i+1 : i+1+max_input_len]; the label of the chunk's last
# token is the first token of the next chunk
target_ids = [
input_id[i + 1:i + 1 + max_input_len] for input_id in input_ids
]
if len(target_ids[0]) < len(token_ids[0]):
target_ids = [x + [padding_token_id] for x in target_ids]
target_ids = [
torch.Tensor(torch.LongTensor(_target_ids))
for _target_ids in target_ids
]
target_ids = pad_sequence(target_ids,
batch_first=True,
padding_value=padding_token_id)
target_ids = target_ids.to(_logits.device)
target_ids = target_ids.to(logits.device)
target_mask = target_ids != padding_token_id
target_count = torch.sum(target_mask, dim=-1)

# compute cross entropy loss
bsz, seq_len, vocab_size = _logits.shape
flat_logits = _logits.contiguous().view(-1, vocab_size)
flat_logits = logits.contiguous().view(-1, vocab_size)
flat_target_ids = target_ids.contiguous().view(-1)
flat_loss_matrix = torch.nn.functional.cross_entropy(
flat_logits,
@@ -248,4 +252,5 @@ def get_ppl(self, input_ids: Union[List[int], List[List[int]]]):
loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1)
loss_avg = loss_sum / target_count
loss_avg = loss_avg.cpu().numpy()

return loss_avg
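
One more note on the rewritten helpers above: the prefill chunk size is no longer taken from `backend_config.max_prefill_token_num`; both `get_logits` and `get_ppl` now derive `max_input_len` from a rough 2 GiB budget for the float32 logits tensor of shape `[bs, max_input_len, vocab_size]`. A quick sanity check of that formula (the vocab size of 92544 is only an assumption for illustration; the code reads the real value from `self.hf_tm_cfg.vocab_size`):

```python
# Rough check of the new `max_input_len` heuristic: keep a float32 logits
# tensor of shape [bs, max_input_len, vocab_size] under ~2 GiB.
budget_bytes = 2 * 1024**3
vocab_size = 92544  # assumed for illustration

for bs in (1, 2, 8):
    max_input_len = budget_bytes // (bs * vocab_size * 4)
    print(bs, max_input_len)  # -> 1 5801, 2 2900, 8 725
```

Note also that `get_ppl` returns the masked average negative log-likelihood per sequence (`loss_avg`); the conventional perplexity is its exponential, e.g. `np.exp(loss_avg)`.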
4 changes: 4 additions & 0 deletions lmdeploy/turbomind/deploy/config.py
@@ -135,5 +135,9 @@ def weight_type(self):
def group_size(self):
return self.model_config.group_size

@property
def vocab_size(self):
return self.model_config.vocab_size

def __str__(self):
return json.dumps(self.to_dict(), indent=2)