Support Qwen2-MoE models (#2723)
* add qwen2-moe

* eliminate `inter_size_` from ffn layer

* clean up

* fix lint

* clean up

* Update config.yaml

---------

Co-authored-by: zhulinJulia24 <[email protected]>
lzhangzz and zhulinJulia24 authored Nov 13, 2024
1 parent e751708 commit d2d4209
Showing 19 changed files with 711 additions and 190 deletions.
2 changes: 2 additions & 0 deletions autotest/config.yaml
@@ -44,10 +44,12 @@ turbomind_chat_model:
- Qwen/Qwen2-1.5B-Instruct
- Qwen/Qwen1.5-7B-Chat
- Qwen/Qwen1.5-4B-Chat-AWQ
- Qwen/Qwen1.5-MoE-A2.7B-Chat
- Qwen/Qwen-VL-Chat
- Qwen/Qwen2.5-0.5B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2-7B-Instruct-GPTQ-Int4
- Qwen/Qwen2-57B-A14B-Instruct-GPTQ-Int4
- mistralai/Mistral-7B-Instruct-v0.3
- mistralai/Mixtral-8x7B-Instruct-v0.1
- lmdeploy/llama2-chat-7b-w4
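The two new entries bring the Qwen MoE family into the turbomind chat regression matrix. As a quick orientation, here is a minimal sketch of running one of the newly listed checkpoints through lmdeploy's pipeline API (engine options are illustrative placeholders, not part of this commit):

from lmdeploy import pipeline, TurbomindEngineConfig

# Illustrative: serve one of the newly listed MoE checkpoints with turbomind.
# tp=1 is a placeholder; pick a value that fits the checkpoint on your GPUs.
pipe = pipeline('Qwen/Qwen1.5-MoE-A2.7B-Chat',
                backend_config=TurbomindEngineConfig(tp=1))
print(pipe(['Briefly introduce mixture-of-experts models.'])[0].text)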
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/deploy/config.py
@@ -50,6 +50,8 @@ class ModelConfig:
expert_num: int = 0
expert_inter_size: int = 0
experts_per_token: int = 0
moe_shared_gate: int = False
moe_norm_topk: int = False

def verify(self):
invalid = {}
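The two new ModelConfig fields describe MoE behaviors that differ between architectures: whether a sigmoid-gated shared expert exists (moe_shared_gate) and whether the routing weights of the selected top-k experts are renormalized (moe_norm_topk). A rough sketch of how the source-model converters in this commit end up populating them (values inferred from the Mixtral and Qwen2-MoE readers below, shown only for orientation):

# Inferred flag settings per architecture (illustrative, not a full config):
mixtral_flags = dict(moe_shared_gate=False, moe_norm_topk=True)  # no shared expert; renormalize top-k weights
qwen2_moe_flags = dict(moe_shared_gate=True)                     # sigmoid-gated shared expert;
                                                                 # top-k normalization follows norm_topk_prob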
21 changes: 17 additions & 4 deletions lmdeploy/turbomind/deploy/module.py
@@ -140,14 +140,18 @@ class MoeFfn(Ffn):
requires:
r.moe_ffn_expert(e, i, kind)
r.moe_ffn_gate(i)
r.moe_ffn_shared_gate(i)
"""

_moe_ffn_expert = 'layers.{0}.moe_ffn.experts.E.{1}.{2}'
_moe_ffn_gate = 'layers.{0}.moe_ffn.gate.{1}'
_moe_ffn_gate = 'layers.{0}.moe_ffn.gate.weight'
_moe_ffn_shared_gate = 'layers.{0}.moe_ffn.shared_gate.weight'

def __init__(self, model: BaseOutputModel):
super().__init__(model)
self.expert_num = model.model_config.expert_num
self.inter_size = model.model_config.expert_inter_size
self.shared_gate = model.model_config.moe_shared_gate

def apply(self, i: int, r: BaseReader):
for p in get_params(r.moe_ffn_expert()):
@@ -157,7 +161,13 @@ def apply(self, i: int, r: BaseReader):
i)

gate = transpose(r.moe_ffn_gate(i))
self.model.save_split(gate, self._moe_ffn_gate.format(i, 'weight'))
self.model.save_split(gate, self._moe_ffn_gate.format(i))

if self.shared_gate:
shared_gate = transpose(r.moe_ffn_shared_gate(i))
# print(shared_gate)
self.model.save_split(shared_gate,
self._moe_ffn_shared_gate.format(i))


class Attn(Module):
@@ -248,8 +258,11 @@ class Transformer:

def __init__(self, model: BaseOutputModel):
self.model = model
ffn = MoeFfn if model.model_config.expert_num else Ffn
modules = [Attn, LayerNorm, ffn]
modules = [Attn, LayerNorm]
if model.model_config.inter_size:
modules.append(Ffn)
if model.model_config.expert_num:
modules.append(MoeFfn)
self.modules = [c(model) for c in modules]
self.misc = Misc(model)

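The new shared_gate export exists because Qwen2-MoE adds a shared expert whose output is scaled by a per-token sigmoid gate and summed with the routed experts' output. A simplified PyTorch-style sketch of that block, assuming the standard Qwen2-MoE layout (a naive token loop for clarity, not the engine's fused kernels):

import torch
import torch.nn.functional as F

def moe_ffn_forward(x, experts, router_gate, shared_expert, shared_gate,
                    top_k=4, norm_topk=False):
    """x: [tokens, hidden]; router_gate: [num_experts, hidden];
    shared_gate: [1, hidden]; experts / shared_expert are callables."""
    probs = F.softmax(x @ router_gate.T, dim=-1)     # [tokens, num_experts]
    weights, idx = probs.topk(top_k, dim=-1)
    if norm_topk:                                    # Mixtral-style renormalization
        weights = weights / weights.sum(-1, keepdim=True)
    out = torch.zeros_like(x)
    for t in range(x.size(0)):                       # naive routing loop
        for w, e in zip(weights[t], idx[t]):
            out[t] += w * experts[int(e)](x[t])
    # Qwen2-MoE only: shared expert runs on every token behind a sigmoid gate.
    out = out + torch.sigmoid(x @ shared_gate.T) * shared_expert(x)
    return out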
2 changes: 2 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/mixtral.py
@@ -33,4 +33,6 @@ def model_info(self):
info['expert_num'] = cfg['num_local_experts']
info['expert_inter_size'] = cfg['intermediate_size']
info['experts_per_token'] = cfg['num_experts_per_tok']
info['moe_norm_topk'] = True
info['inter_size'] = 0
return info
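Zeroing inter_size here is what makes the reworked Transformer constructor in module.py skip the dense Ffn module for Mixtral, while Qwen2-MoE keeps both paths: its shared expert is exported through the regular Ffn (sized by shared_expert_intermediate_size) alongside MoeFfn. A small illustration of the selection logic, with hypothetical sizes:

def layer_modules(inter_size, expert_num):
    # Mirrors the new Transformer.__init__ selection above.
    mods = ['Attn', 'LayerNorm']
    if inter_size:
        mods.append('Ffn')      # dense / shared-expert FFN
    if expert_num:
        mods.append('MoeFfn')   # routed experts
    return mods

layer_modules(inter_size=0, expert_num=8)       # Mixtral: routed experts only
layer_modules(inter_size=5632, expert_num=60)   # Qwen1.5-MoE-A2.7B (sizes assumed): both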
61 changes: 61 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/qwen.py
@@ -120,3 +120,64 @@ def model_info(self):
cfg = super().model_info()
cfg['attn_bias'] = 1
return cfg


class Qwen2MoeReader(LlamaReader):

ffn_pattern = r'shared_expert\.'

def moe_ffn_expert(self, e=None, i=None, kind=None):
if not kind:
return self.filter(r'experts')
result = []
for key in ['gate', 'down', 'up']:
name = f'model.layers.{i}.mlp.experts.{e}.{key}_proj.{kind}'
tensor = self.params.get(name)
tensor = self.transform(tensor, kind)
result.append(tensor)
return (*result, )

def moe_ffn_gate(self, i):
return self.params.get(f'model.layers.{i}.mlp.gate.weight')

def _ffn(self, i: int, kind: str):
"""Get ffn kind for layer i."""
if not kind:
return self.filter(self.ffn_pattern)
result = []
for key in ['gate', 'down', 'up']:
tensor = self.params[
f'model.layers.{i}.mlp.shared_expert.{key}_proj.{kind}']
tensor = self.transform(tensor, kind)
result.append(tensor)
return (*result, )

def moe_ffn_shared_gate(self, i):
return self.params.get(
f'model.layers.{i}.mlp.shared_expert_gate.weight')


@INPUT_MODELS.register_module(name='qwen2-moe')
class Qwen2MoeModel(LlamaModel):

Reader = Qwen2MoeReader

def tokenizer_info(self):
"""https://huggingface.co/Qwen/Qwen1.5-7B-Chat/blob/main/generation_con
fig.json.""" # noqa: E501
n_words = 152064
bos_id = 151643
eos_id = 151645
return n_words, bos_id, eos_id

def model_info(self):
cfg = self.model_config
info = super().model_info()
info['expert_num'] = cfg['num_experts']
info['expert_inter_size'] = cfg['moe_intermediate_size']
info['experts_per_token'] = cfg['num_experts_per_tok']
info['inter_size'] = cfg['shared_expert_intermediate_size']
info['moe_shared_gate'] = True
info['moe_norm_topk_prob'] = cfg['norm_topk_prob']
info['attn_bias'] = 1
return info
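For reference, these are the per-layer HF checkpoint keys the reader consumes (layer 0 shown, 'weight' kind), reconstructed from the f-strings above and assuming the standard Qwen2MoeForCausalLM weight layout:

hf_keys = [
    'model.layers.0.mlp.gate.weight',                     # router -> MoeFfn gate
    'model.layers.0.mlp.experts.0.gate_proj.weight',      # routed expert 0 (repeated for
    'model.layers.0.mlp.experts.0.up_proj.weight',        #  e = 0..num_experts-1)
    'model.layers.0.mlp.experts.0.down_proj.weight',
    'model.layers.0.mlp.shared_expert.gate_proj.weight',  # shared expert -> regular Ffn path
    'model.layers.0.mlp.shared_expert.up_proj.weight',
    'model.layers.0.mlp.shared_expert.down_proj.weight',
    'model.layers.0.mlp.shared_expert_gate.weight',       # per-token gate -> moe_ffn.shared_gate
]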
1 change: 1 addition & 0 deletions lmdeploy/turbomind/supported_models.py
@@ -20,6 +20,7 @@
QWenLMHeadModel='qwen',
# Qwen2
Qwen2ForCausalLM='qwen2',
Qwen2MoeForCausalLM='qwen2-moe',
# mistral
MistralForCausalLM='llama',
# llava
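With Qwen2MoeForCausalLM registered, turbomind can recognize these checkpoints from the architectures field in their HF config. A quick check, assuming this module's existing is_supported helper keeps its usual signature:

from lmdeploy.turbomind.supported_models import is_supported

# Should now report Qwen2-MoE checkpoints as turbomind-compatible
# (model id used purely as an example).
print(is_supported('Qwen/Qwen1.5-MoE-A2.7B-Chat'))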