diff --git a/examples/model_compression/pp-minilm/README.md b/examples/model_compression/pp-minilm/README.md
index ed77ffc9993d..b8234c33f11e 100644
--- a/examples/model_compression/pp-minilm/README.md
+++ b/examples/model_compression/pp-minilm/README.md
@@ -81,7 +81,7 @@ PP-MiniLM 压缩方案以面向预训练模型的任务无关知识蒸馏(Task-a
 │ └── run_clue.sh          # launch script for fine-tuning on CLUE
 │ └── run_one_search.sh    # fine-tuning (parameter search) script for a single dataset
 │ └── run_all_search.sh    # fine-tuning (parameter search) script for all CLUE datasets
-│ └── export_model.sh      # script for exporting the fine-tuned deployment model
+│ └── export_model.py      # script for exporting the fine-tuned deployment model
 ├── pruning                # pruning and distillation directory
 │ └── prune.py             # pruning and distillation script
 │ └── prune.sh             # launch script for pruning and distillation
diff --git a/examples/model_compression/pp-minilm/general_distill/README.md b/examples/model_compression/pp-minilm/general_distill/README.md
index df8767f5a50a..13f8c66dd010 100644
--- a/examples/model_compression/pp-minilm/general_distill/README.md
+++ b/examples/model_compression/pp-minilm/general_distill/README.md
@@ -29,9 +29,9 @@ cd ..
 
 The arguments of `general_distill.py` are described below:
 
-- `model_type` indicates the student model type; currently only 'ernie' and 'roberta' are supported.
+- `model_type` indicates the student model type; currently only 'ppminilm' and 'roberta' are supported.
 - `num_relation_heads` the number of relation heads, usually 64 for a large-size teacher model and 48 for a base-size teacher model.
-- `teacher_model_type` indicates the teacher model type; currently only 'ernie' and 'roberta' are supported.
+- `teacher_model_type` indicates the teacher model type; currently only 'roberta' is supported.
 - `teacher_layer_index` the layer of the teacher model used during distillation
 - `student_layer_index` the layer of the student model used during distillation
 - `teacher_model_name_or_path` the name of the teacher model, e.g. `'roberta-wwm-ext-large'`
diff --git a/examples/model_compression/pp-minilm/general_distill/general_distill.py b/examples/model_compression/pp-minilm/general_distill/general_distill.py
index 81f04f5e889f..d324bdbb7556 100644
--- a/examples/model_compression/pp-minilm/general_distill/general_distill.py
+++ b/examples/model_compression/pp-minilm/general_distill/general_distill.py
@@ -32,12 +32,12 @@
 from paddlenlp.utils.tools import TimeCostAverage
 from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.transformers import RobertaModel, RobertaTokenizer
-from paddlenlp.transformers import ErnieModel, ErnieForSequenceClassification, ErnieTokenizer
+from paddlenlp.transformers import PPMiniLMModel, PPMiniLMForSequenceClassification, PPMiniLMTokenizer
 from paddlenlp.transformers.distill_utils import to_distill, calc_multi_relation_loss
 
 MODEL_CLASSES = {
     "roberta": (RobertaModel, RobertaTokenizer),
-    "ernie": (ErnieForSequenceClassification, ErnieTokenizer)
+    "ppminilm": (PPMiniLMForSequenceClassification, PPMiniLMTokenizer)
 }
 
 
@@ -47,14 +47,14 @@ def parse_args():
     # Required parameters
     parser.add_argument(
         "--model_type",
-        default="ernie",
+        default="ppminilm",
         type=str,
         required=True,
         help="Model type selected in the list: " +
         ", ".join(MODEL_CLASSES.keys()), )
     parser.add_argument(
         "--teacher_model_type",
-        default="ernie",
+        default="roberta",
         type=str,
         required=True,
         help="Model type selected in the list: " +
@@ -276,14 +276,14 @@ def do_train(args):
     # For student
     model_class, _ = MODEL_CLASSES[args.model_type]
     if args.num_layers == 6:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=6,
             hidden_act='relu',
             intermediate_size=3072,
             hidden_size=768)  # layer: 6
     elif args.num_layers == 4:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=4,
             hidden_act='relu',
@@ -291,13 +291,13 @@
             hidden_size=256,
             num_attention_heads=16)  # layer: 4
     else:
-        ernie = ErnieModel(
+        ppminilm = PPMiniLMModel(
             vocab_size=tokenizer.vocab_size,
             num_hidden_layers=2,
             hidden_act='relu',
             hidden_size=128,
             intermediate_size=512)  # layer: 2
-    student = model_class(ernie)
+    student = model_class(ppminilm)
 
     teacher = teacher_model_class.from_pretrained(
         args.teacher_model_name_or_path)
diff --git a/examples/model_compression/pp-minilm/general_distill/run.sh b/examples/model_compression/pp-minilm/general_distill/run.sh
index 3db0d135973b..be940e7c6d8b 100644
--- a/examples/model_compression/pp-minilm/general_distill/run.sh
+++ b/examples/model_compression/pp-minilm/general_distill/run.sh
@@ -47,7 +47,7 @@
 cp ../../../../paddlenlp/transformers/distill_utils.py ${output_dir}/
 
 python3 -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" general_distill.py \
-    --model_type ernie \
+    --model_type ppminilm \
     --num_relation_heads ${numH} \
     --teacher_model_type ${teacher} \
     --teacher_layer_index ${teacher_layer_index} \
diff --git a/paddlenlp/transformers/distill_utils.py b/paddlenlp/transformers/distill_utils.py
index 3f67c0d022b1..c83cd8b045fd 100644
--- a/paddlenlp/transformers/distill_utils.py
+++ b/paddlenlp/transformers/distill_utils.py
@@ -21,7 +21,7 @@
 from paddle.fluid.data_feeder import convert_dtype
 
 from paddlenlp.utils.log import logger
-from paddlenlp.transformers import ErnieForSequenceClassification
+from paddlenlp.transformers import PPMiniLMForSequenceClassification
 from paddlenlp.transformers import TinyBertForPretraining
 from paddlenlp.transformers import BertForSequenceClassification
 
@@ -208,7 +208,7 @@ def to_distill(self,
     if return_qkv:
         # forward function of student class should be replaced for distributed training.
         TinyBertForPretraining._forward = minilm_pretraining_forward
-        ErnieForSequenceClassification._forward = minilm_pretraining_forward
+        PPMiniLMForSequenceClassification._forward = minilm_pretraining_forward
     else:
         TinyBertForPretraining._forward = tinybert_forward
 
@@ -216,7 +216,7 @@ def init_func(layer):
         if isinstance(layer, (MultiHeadAttention, TransformerEncoderLayer,
                               TransformerEncoder, TinyBertForPretraining,
                               BertForSequenceClassification,
-                              ErnieForSequenceClassification)):
+                              PPMiniLMForSequenceClassification)):
             layer.forward = layer._forward
             if isinstance(layer, TransformerEncoder):
                 layer.return_layer_outputs = return_layer_outputs
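
For readers following the rename, below is a minimal sketch (not part of the patch) of how the renamed PP-MiniLM classes fit together, mirroring the `args.num_layers == 6` branch of `general_distill.py` above. The checkpoint name passed to `from_pretrained` is an illustrative assumption; in the script the tokenizer and vocabulary come from its own command-line arguments.

# Minimal sketch, assuming only the PaddleNLP classes imported in the patch above.
# The tokenizer checkpoint name is illustrative, not taken from the patch.
from paddlenlp.transformers import (PPMiniLMForSequenceClassification,
                                    PPMiniLMModel, PPMiniLMTokenizer)

tokenizer = PPMiniLMTokenizer.from_pretrained("ppminilm-6l-768h")  # hypothetical checkpoint name

# Randomly initialized 6-layer student, as in the `args.num_layers == 6` branch.
ppminilm = PPMiniLMModel(
    vocab_size=tokenizer.vocab_size,
    num_hidden_layers=6,
    hidden_act='relu',
    intermediate_size=3072,
    hidden_size=768)

# Wrapped the same way as `student = model_class(ppminilm)` in the patch,
# where model_class is resolved from MODEL_CLASSES["ppminilm"].
student = PPMiniLMForSequenceClassification(ppminilm)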