|
| 1 | +#!/usr/bin/env bash |
| 2 | + |
| 3 | +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
# Test training benchmark for a model.
# Usage: bash benchmark/run_benchmark.sh
# Configuration is passed via environment variables (all have defaults, see
# _set_params): model_item, run_mode, device_num, global_batch_size,
# autoconfig_json_file, modle_json_file, plus the benchmark-framework globals
# PROFILING, TRAIN_LOG_DIR, PROFILING_LOG_DIR, LOG_PATH_INDEX_DIR, BENCHMARK_ROOT.
function _set_params(){
    # Populate the global configuration read by _train and by the sourced
    # benchmark framework (run_model.sh / analysis.py). Every variable set
    # here is part of that external contract — do not rename.
    model_item=${model_item:-"CE_llama7b_autotuner"}   # benchmark case name; "CE" prefix disables the timeout in _train
    run_mode=${run_mode:-"pretrain"}                   # selects the case branch in _train
    device_num=${device_num:-"N1C8"}                   # e.g. N1C8 = 1 node, 8 cards
    global_batch_size=${global_batch_size:-8}
    autoconfig_json_file=${autoconfig_json_file:-"autoconfig/llama7b_pretrain.json"}   # auto-tuner search config
    # NOTE(review): "modle" looks like a typo for "model", but the name is an
    # externally supplied env var — keep the spelling as is.
    modle_json_file=${modle_json_file:-"autoconfig/llama7b_pretrain_params.json"}      # model/pretrain parameter config

    base_batch_size=${global_batch_size}

    profiling=${PROFILING:-"false"}    # (required) profiling switch, off by default; passed in as a global env var
    model_repo="PaddleNLP"             # (required) name of the model suite
    speed_unit="tokens/s"              # (required) unit of the reported speed metric
    skip_steps=0                       # (required) log parsing: skip the first unstable steps
    keyword="ips:"                     # (required) log parsing: keyword marking lines that carry perf data
    convergence_key="loss:"            # (optional) log parsing: keyword marking lines that carry convergence data

    fp_item="fp16"
    workerlog_id=0
    # Generic bookkeeping below; usually no per-model changes needed.
    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode}   # (required) keep this exact format; aligned with competitor naming
    device=${CUDA_VISIBLE_DEVICES//,/ }   # "0,1,2" -> "0 1 2"
    arr=(${device})                       # intentionally unquoted: word-split into one element per GPU id
    num_gpu_devices=${#arr[*]}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}             # (required) TRAIN_LOG_DIR is set globally by the benchmark framework
    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}   # (required) PROFILING_LOG_DIR is set globally by the benchmark framework
    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
    mkdir -p $(dirname ${train_log_file})

    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
    mkdir -p $(dirname ${profiling_log_file})

    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
    mkdir -p $(dirname ${speed_log_file})

    OUTPUT_PATH=${run_log_path}/output
    is_large_model=True
}
| 58 | + |
function _train(){
    # Launch one benchmark training run, append "<name>, SUCCESS|FAIL" markers
    # to the log, then verify the auto-tuner output with autoconfig/check.sh.
    # Reads the globals prepared by _set_params (model_name, run_mode,
    # autoconfig_json_file, modle_json_file, log paths, OUTPUT_PATH, ...).
    batch_size=${per_device_train_batch_size}  # for multi-card single-process runs, compute the effective bs here

    # Start from a clean output directory.
    if [ -d "${OUTPUT_PATH}" ]; then
        rm -rf "${OUTPUT_PATH}"
    fi
    mkdir -p "${OUTPUT_PATH}"

    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"

    if [ "${profiling}" = "true" ]; then
        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
        log_file=${profiling_log_file}
    else
        add_options=""
        log_file=${train_log_file}
    fi

    # Pass the trainer rank through only when the framework provides one.
    if [ -n "${PADDLE_TRAINER_ID}" ]; then
        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
    else
        PADDLE_RANK_OPTION=""
    fi
    # Generic launch command. Both branches are currently identical; the case
    # statement is kept so per-mode options can be added later.
    case ${run_mode} in
    pretrain) echo "Run with: run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
            --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}"
        ;;
    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
        train_cmd="python -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 ${PADDLE_RANK_OPTION}\
            --auto_tuner_json ${autoconfig_json_file} run_pretrain.py ${modle_json_file}"
        ;;
    esac
    # Fail fast if the expected repo layout is missing instead of launching
    # the training command from the wrong directory.
    cd ../llm/llama || { echo "cannot cd to ../llm/llama" >&2; return 1; }
    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
    python -c "import paddlenlp"
    if [[ ${model_item} =~ "CE" ]]; then   # CE accuracy runs: no time limit
        ${train_cmd} > "${log_file}" 2>&1
    else
        timeout 30m ${train_cmd} > "${log_file}" 2>&1
    fi
    train_rc=$?   # capture immediately so later commands cannot clobber the status
    if [ ${train_rc} -ne 0 ]; then
        echo -e "${model_name}, FAIL" >> "${log_file}"
    else
        echo -e "${model_name}, SUCCESS" >> "${log_file}"
    fi
    # Check the status directly in the if-condition instead of testing $? afterwards.
    if bash autoconfig/check.sh ${autoconfig_json_file} >> "${log_file}" 2>&1; then
        echo -e "auto_tuner, SUCCESS" >> "${log_file}"
    else
        echo -e "auto_tuner, FAIL" >> "${log_file}"
    fi
    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
    # Collect per-worker logs for multi-card runs ([ -a ] is deprecated; use && between tests).
    if [ "${device_num}" != "N1C1" ] && [ -d ./autoconfig/best_cfg ]; then
        case_path=$PWD && cd - && mkdir -p mylog   # PaddleNLP/tests/mylog
        cp -r ${case_path}/autoconfig/best_cfg/workerlog.* ./mylog/
    fi
}
| 119 | + |
# Make the parent directory (the PaddleNLP repo root) importable.
export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
# run_model.sh parses benchmark-format logs with analysis.py; comment this out
# for log-only dry runs, but re-enable before submitting.
source "${BENCHMARK_ROOT}/scripts/run_model.sh"
_set_params "$@"   # quote "$@" so arguments containing spaces survive intact
#_train            # uncomment to produce only the training log without parsing
_run               # defined in run_model.sh; calls _train internally
0 commit comments