The get_ppl missed the last token of each iteration during multi-iter prefill #2499

Merged · 14 commits · Sep 26, 2024
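
In short: when a prompt is longer than the per-iteration chunk size, `get_ppl` prefills it over several iterations. The label for the last position of each chunk is the first token of the *next* chunk, but the old per-chunk target construction (`(x + [padding_token_id])[1:]`) padded the tail of every chunk, so one real token per intermediate iteration was silently dropped from the loss. A minimal standalone sketch of the two target layouts (toy token ids, not LMDeploy code):

```python
PAD = -100  # padding label id, ignored by the loss

def chunk_targets_old(input_ids, chunk):
    """Old behaviour: targets are built per chunk, so the last token of
    every chunk (not just the final one) is replaced by PAD and dropped."""
    out = []
    for i in range(0, len(input_ids), chunk):
        piece = input_ids[i:i + chunk]
        out.append((piece + [PAD])[1:])
    return out

def chunk_targets_new(input_ids, chunk):
    """Fixed behaviour: the label of a chunk's last position is the first
    token of the next chunk; only the end of the sequence is padded."""
    out = []
    for i in range(0, len(input_ids), chunk):
        tgt = input_ids[i + 1:i + 1 + chunk]
        if len(tgt) < len(input_ids[i:i + chunk]):
            tgt = tgt + [PAD]
        out.append(tgt)
    return out

ids = [1, 2, 3, 4, 5, 6, 7]
print(chunk_targets_old(ids, 3))  # [[2, 3, -100], [5, 6, -100], [-100]]
print(chunk_targets_new(ids, 3))  # [[2, 3, 4], [5, 6, 7], [-100]]
```

The rewritten loop in `lmdeploy/serve/utils.py` below implements the second layout.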
8 changes: 2 additions & 6 deletions docs/en/advance/long_context.md
@@ -96,11 +96,9 @@ This test takes approximately 364 seconds per round when conducted on A100-80G G
The following codes demonstrate how to use LMDeploy to calculate perplexity.

```python
from transformers import AutoTokenizer
from lmdeploy import TurbomindEngineConfig, pipeline
import numpy as np

# load model and tokenizer
# build pipeline
model_repoid_or_path = 'internlm/internlm2_5-7b-chat-1m'
backend_config = TurbomindEngineConfig(
rope_scaling_factor=2.5,
@@ -109,11 +107,9 @@ backend_config = TurbomindEngineConfig(
cache_max_entry_count=0.7,
tp=4)
pipe = pipeline(model_repoid_or_path, backend_config=backend_config)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

# get perplexity
text = 'Use a long prompt to replace this sentence'
input_ids = tokenizer.encode(text)
ppl = pipe.get_ppl(input_ids)[0]
ppl = pipe.get_ppl(text)
print(ppl)
```
16 changes: 8 additions & 8 deletions docs/en/llm/pipeline.md
@@ -119,21 +119,21 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config):
- **An example to calculate logits & ppl:**

```python
from transformers import AutoTokenizer
from lmdeploy import pipeline

model_repoid_or_path='internlm/internlm2_5-7b-chat'
pipe = pipeline(model_repoid_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

# logits
messages = [
{"role": "user", "content": "Hello, how are you?"},
prompts = [
"Hello, I am an AI assistant named InternLM. I am developed by Shanghai AI Laboratory",
"How to use LMDeploy to deploy a LLM model?"
]
input_ids = tokenizer.apply_chat_template(messages)
logits = pipe.get_logits(input_ids)

# logits
logits = pipe.get_logits(prompts)

# ppl
ppl = pipe.get_ppl(input_ids)
ppl = pipe.get_ppl(prompts)
```

- **Below is an example for pytorch backend. Please install triton first.**
8 changes: 2 additions & 6 deletions docs/zh_cn/advance/long_context.md
@@ -96,11 +96,9 @@ passkey_retrieval(session_len, 5)
The following demonstrates how to use LMDeploy to calculate perplexity.

```python
from transformers import AutoTokenizer
from lmdeploy import TurbomindEngineConfig, pipeline
import numpy as np

# load model and tokenizer
# build pipeline
model_repoid_or_path = 'internlm/internlm2_5-7b-chat-1m'
backend_config = TurbomindEngineConfig(
rope_scaling_factor=2.5,
@@ -109,11 +107,9 @@ backend_config = TurbomindEngineConfig(
cache_max_entry_count=0.7,
tp=4)
pipe = pipeline(model_repoid_or_path, backend_config=backend_config)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

# get perplexity
text = 'Use a long prompt to replace this sentence'
input_ids = tokenizer.encode(text)
loss = pipe.get_ppl(input_ids)[0]
ppl = pipe.get_ppl(text)
print(ppl)
```
16 changes: 8 additions & 8 deletions docs/zh_cn/llm/pipeline.md
@@ -119,21 +119,21 @@ for item in pipe.stream_infer(prompts, gen_config=gen_config):
- **Calculating logits & ppl:**

```python
from transformers import AutoTokenizer
from lmdeploy import pipeline

model_repoid_or_path='internlm/internlm2_5-7b-chat'
pipe = pipeline(model_repoid_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_repoid_or_path, trust_remote_code=True)

# logits
messages = [
{"role": "user", "content": "Hello, how are you?"},
prompts = [
"Hello, I am an AI assistant named InternLM. I am developed by Shanghai AI Laboratory",
"How to use LMDeploy to deploy a LLM model?"
]
input_ids = tokenizer.apply_chat_template(messages)
logits = pipe.get_logits(input_ids)

# logits
logits = pipe.get_logits(prompts)

# ppl
ppl = pipe.get_ppl(input_ids)
ppl = pipe.get_ppl(prompts)
```

- **Using the pytorch backend**
31 changes: 0 additions & 31 deletions lmdeploy/pytorch/engine/engine.py
@@ -1046,34 +1046,3 @@ async def async_end(self, session_id: int):
def end(self, session_id: int):
"""Add new session."""
return self.engine_instance.end(session_id)

def decode(self,
input_ids,
input_embeddings: List[InputEmbeddingType] = None,
input_embedding_ranges: List[InputEmbeddingRangeType] = None,
steps: List[int] = None,
sequence_start: bool = True,
sequence_end: bool = True,
adapter_names: List[str] = None):
"""Perform context decode on input tokens.

Args:
input_ids (List[List[int]] | List[np.ndaray]): the batch of input
token ids
steps (List[int]): the offset of the k/v cache
input_embeddings (List[List[Union[torch.Tensor, np.ndarray]]]):
embeddings features
input_embedding_ranges: (List[List[Tuple[int, int]]]):
the begin/end offsets of input_embeddings to input_ids
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
adapter_names (List[str]): The name of the adapters.
"""
return self.engine_instance.decode(
input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
steps=steps,
sequence_start=sequence_start,
sequence_end=sequence_end,
adapter_names=adapter_names)
99 changes: 52 additions & 47 deletions lmdeploy/serve/utils.py
@@ -65,7 +65,7 @@ def prepare_inputs(self, prompts: Union[PromptType, List[PromptType]]):

def get_logits(
self,
input_ids: Union[InputIdsType, List[InputIdsType]],
inputs: Union[str, List[str]],
input_embeddings: Union[InputEmbsType, List[InputEmbsType]] = None,
input_embedding_ranges: Union[InputEmbRngsType,
List[InputEmbRngsType]] = None):
@@ -75,13 +75,17 @@ def get_logits(
inputs (Union[str, List[str]]): the batch of input texts
"""
assert len(input_ids) > 0
if isinstance(input_ids[0], int):
input_ids = [input_ids]
for input_id in input_ids:
assert len(input_id) > 0
if isinstance(inputs, str):
inputs = [inputs]
assert all(len(_) > 0 for _ in inputs)

input_ids = [self.tokenizer.encode(text) for text in inputs]
bs = len(input_ids)
# TODO: a better way to determine `max_input_len`, at most allocate
# 2G mem for logits with shape [bs, max_input_len, vocab_size]
vocab_size = self.hf_tm_cfg.vocab_size
max_input_len = 2 * 1024**3 // (bs * vocab_size * 4)

max_input_len = self.backend_config.max_prefill_token_num
n_max_iter = np.ceil(
max([len(input_id)
for input_id in input_ids]) / max_input_len).astype(int)
@@ -173,65 +177,65 @@ def _split_embeddings(input_ids, niter, iter_len, embeddings,
logits = torch.cat(logits, dim=1)
return logits

def get_ppl(self, input_ids: Union[List[int], List[List[int]]]):
"""Get perplexity scores given a list of input tokens.
def get_ppl(self, inputs: List[str]) -> List[float]:
"""Get perplexity scores given a list of inputs.

Args:
input_ids (Union[List[int], List[List[int]]]): the batch of
input token ids
inputs (List[str]): A list of strings.

Returns:
List[float]: A list of perplexity scores.
"""
assert len(input_ids) > 0
if isinstance(input_ids[0], int):
input_ids = [input_ids]
for input_id in input_ids:
assert len(input_id) > 1
if isinstance(inputs, str):
inputs = [inputs]
assert all(len(_) > 0 for _ in inputs)

max_input_len = self.backend_config.max_prefill_token_num
n_max_iter = np.ceil(
max([len(input_id)
for input_id in input_ids]) / max_input_len).astype(int)
generator = self.engine.create_instance()
input_ids = [self.tokenizer.encode(text) for text in inputs]

index_range_starts = []
index_range_ends = []
for input_id in input_ids:
index_range_start = np.array(
[i * max_input_len for i in range(n_max_iter)])
index_range_end = index_range_start + max_input_len
index_range_start[index_range_start >= len(input_id)] = len(
input_id)
index_range_end[index_range_end >= len(input_id)] = len(input_id)
index_range_starts.append(index_range_start)
index_range_ends.append(index_range_end)
bs = len(input_ids)
max_seq_len = len(input_ids[0])

# TODO: a better way to determine `max_input_len`, at most allocate
# 2G mem for logits with shape [bs, max_input_len, vocab_size]
vocab_size = self.hf_tm_cfg.vocab_size
max_input_len = 2 * 1024**3 // (bs * vocab_size * 4)

generator = self.engine.create_instance()
all_loss_matrix = []
all_target_mask = []
for i in range(n_max_iter):
steps = [start[i] for start in index_range_starts]
_input_ids = [
input_id[start[i]:end[i]] for input_id, start, end in zip(
input_ids, index_range_starts, index_range_ends)
for i in range(0, max_seq_len, max_input_len):
token_ids = [
input_id[i:i + max_input_len] for input_id in input_ids
]
_logits = generator.decode(_input_ids,
steps=steps,
sequence_start=(i == 0),
sequence_end=(i == n_max_iter - 1))
_logits = _logits.float().cpu()
steps = [i] * bs
logits = generator.decode(
token_ids,
steps=steps,
sequence_start=(i == 0),
sequence_end=(i + max_input_len >= max_seq_len))
bsz, seq_len, vocab_size = logits.shape
logits = logits.float().cpu()
padding_token_id = -100
target_ids = [(x + [padding_token_id])[1:] for x in _input_ids]
# logits at position j predict token j+1, so the labels for this chunk
# are input_id[i+1 : i+1+max_input_len]; the label of the chunk's last
# token is the first token of the next chunk
target_ids = [
input_id[i + 1:i + 1 + max_input_len] for input_id in input_ids
]
if len(target_ids[0]) < len(token_ids[0]):
target_ids = [x + [padding_token_id] for x in target_ids]
target_ids = [
torch.Tensor(torch.LongTensor(_target_ids))
for _target_ids in target_ids
]
target_ids = pad_sequence(target_ids,
batch_first=True,
padding_value=padding_token_id)
target_ids = target_ids.to(_logits.device)
target_ids = target_ids.to(logits.device)
target_mask = target_ids != padding_token_id
target_count = torch.sum(target_mask, dim=-1)

# compute cross entropy loss
bsz, seq_len, vocab_size = _logits.shape
flat_logits = _logits.contiguous().view(-1, vocab_size)
flat_logits = logits.contiguous().view(-1, vocab_size)
flat_target_ids = target_ids.contiguous().view(-1)
flat_loss_matrix = torch.nn.functional.cross_entropy(
flat_logits,
@@ -248,4 +252,5 @@ def get_ppl(self, input_ids: Union[List[int], List[List[int]]]):
loss_sum = torch.sum(all_loss_matrix * all_target_mask, dim=1)
loss_avg = loss_sum / target_count
loss_avg = loss_avg.cpu().numpy()

return loss_avg
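
One more note on the rewritten helpers above: the prefill chunk size is no longer taken from `backend_config.max_prefill_token_num`; both `get_logits` and `get_ppl` now derive `max_input_len` from a rough 2 GiB budget for the float32 logits tensor of shape `[bs, max_input_len, vocab_size]`. A quick sanity check of that formula (the vocab size of 92544 is only an assumption for illustration; the code reads the real value from `self.hf_tm_cfg.vocab_size`):

```python
# Rough check of the new `max_input_len` heuristic: keep a float32 logits
# tensor of shape [bs, max_input_len, vocab_size] under ~2 GiB.
budget_bytes = 2 * 1024**3
vocab_size = 92544  # assumed for illustration

for bs in (1, 2, 8):
    max_input_len = budget_bytes // (bs * vocab_size * 4)
    print(bs, max_input_len)  # -> 1 5801, 2 2900, 8 725
```

Note also that `get_ppl` returns the masked average negative log-likelihood per sequence (`loss_avg`); the conventional perplexity is its exponential, e.g. `np.exp(loss_avg)`.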
4 changes: 4 additions & 0 deletions lmdeploy/turbomind/deploy/config.py
@@ -135,5 +135,9 @@ def weight_type(self):
def group_size(self):
return self.model_config.group_size

@property
def vocab_size(self):
return self.model_config.vocab_size

def __str__(self):
return json.dumps(self.to_dict(), indent=2)