diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/README.md b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/README.md
new file mode 100644
index 0000000..d89058e
--- /dev/null
+++ b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/README.md
@@ -0,0 +1,5 @@
+## 运行方法
+```
+# develop(这里card_num指的是运行的卡数,只能是2或4或8, develop必须在incubate之前运行, log_dir为输出log的路径。在传入数据时需要将数据文件存储到data文件夹下)
+$ bash test_vocab_parallel_embedding.sh card_num develop log_dir
+```
\ No newline at end of file
diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/get_and_test_paddle_result.py b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/get_and_test_paddle_result.py
new file mode 100644
index 0000000..d380b89
--- /dev/null
+++ b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/get_and_test_paddle_result.py
@@ -0,0 +1,202 @@
+import numpy as np
+import paddle
+import paddle.distributed as paddle_dist
+import paddle.distributed.fleet as fleet
+import init_config_class
+import random
+import sys
+sys.path.append("../../..")
+from utils import (
+    np_assert_accuracy
+)
+
+# Results collected per dtype for the current test mode, in dtype_list order.
+global_out = []
+global_dout = []
+
+def set_random_seed(seed):
+    """Seed the Python, NumPy, Paddle and model-parallel random generators."""
+    random.seed(seed)
+    np.random.seed(seed)
+    paddle.seed(seed)
+    fleet.meta_parallel.model_parallel_random_seed(seed)
+
+class TestPaddle(init_config_class.InitConfigClass):
+    """Run one VocabParallelEmbedding forward/backward pass for a given dtype."""
+
+    def __init__(self, group, test_mode=1, np_input_dir_forward="", np_input_dir_backward="", dtype=""):
+        """Load the inputs from disk and remember the run settings.
+
+        Args:
+            group: communication group handed to the layer as `mp_group`.
+            test_mode: when 2, float32 results are round-tripped through uint16.
+            np_input_dir_forward: path of the file holding the forward inputs.
+            np_input_dir_backward: path of the file holding the output gradient.
+            dtype: dtype under test; this file uses "float32" and "bfloat16".
+        """
+        self._init_params(np_input_dir_forward, np_input_dir_backward, dtype)
+        self._init_threshold()
+        self._init_np_inputs_and_dout()
+        self._group = group
+        self.test_mode = test_mode
+
+    def _gen_eager_inputs_and_dout(self):
+        """Build the index, table and output-gradient tensors on the current device."""
+        place = paddle.device.get_device()
+        x_eager = paddle.to_tensor(
+            self._np_x,
+            dtype="int64",
+            place=place,
+        )
+        x_eager.stop_gradient = False
+        # bfloat16 inputs are created as float32 here and cast in _cal_eager_res.
+        table_eager = paddle.to_tensor(
+            self._np_table,
+            dtype=self._dtype if self._dtype != 'bfloat16' else "float32",
+            place=place,
+        )
+        table_eager.stop_gradient = False
+        dout_eager = paddle.to_tensor(
+            self._np_dout,
+            dtype=self._dtype if self._dtype != 'bfloat16' else "float32",
+            place=place,
+        )
+        dout_eager.stop_gradient = False
+        return x_eager, table_eager, dout_eager
+
+    def _cal_eager_res(self, x, table, dout):
+        """Run the embedding forward and backward passes.
+
+        Returns the layer output and the gradient of the layer weight.
+        """
+        x_t = x
+        table_t = table
+        dout_t = dout
+
+        if self._dtype == "float32":
+            # Round-trip through uint16 (the dtype the bfloat16 branch below casts
+            # to) and back. The cast back must read the rounded tensors; reading
+            # the original inputs would turn the first two casts into dead stores.
+            table_t = paddle.cast(table, dtype="uint16")
+            dout_t = paddle.cast(dout, dtype="uint16")
+            table_t = paddle.cast(table_t, dtype="float32")
+            dout_t = paddle.cast(dout_t, dtype="float32")
+
+        if self._dtype == "bfloat16":
+            table_t = paddle.cast(table, dtype="uint16")
+            dout_t = paddle.cast(dout, dtype="uint16")
+
+        embedding = fleet.meta_parallel.VocabParallelEmbedding(self._num_embeddings, self._embedding_dim, mp_group=self._group)
+        paddle.assign(table_t, embedding.weight)
+        out = embedding(x_t)
+
+        out_grads = paddle.grad(
+            [out], [embedding.weight], grad_outputs=[dout_t]
+        )
+
+        out_grads = out_grads[0]
+
+        if self._dtype == "bfloat16":
+            out = paddle.cast(out, dtype="float32")
+            out_grads = paddle.cast(out_grads, dtype="float32")
+
+        if self.test_mode == 2 and self._dtype == "float32":
+            # Mode 2: round-trip the float32 results through uint16 as well.
+            out = paddle.cast(out, dtype="uint16")
+            out_grads = paddle.cast(out_grads, dtype="uint16")
+            out = paddle.cast(out, dtype="float32")
+            out_grads = paddle.cast(out_grads, dtype="float32")
+
+        return out, out_grads
+
+    def _test_eager_accuracy(self):
+        """Run one forward/backward pass and append the results to the global lists."""
+        x_eager, table_eager, dout_eager = self._gen_eager_inputs_and_dout()
+        out_eager, out_grads_eager = self._cal_eager_res(x_eager, table_eager, dout_eager)
+
+        del x_eager
+        del table_eager
+        del dout_eager
+        paddle.device.cuda.empty_cache()
+        out_eager_np = out_eager.numpy()
+        out_grads_eager_np = out_grads_eager.numpy()
+
+        global_out.append(out_eager_np)
+        global_dout.append(out_grads_eager_np)
+
+        del out_eager
+        del out_grads_eager
+        paddle.device.cuda.empty_cache()
+
+
+dtype_list = ["float32", "bfloat16"]
+
+dist_strategy = fleet.DistributedStrategy()
+world_size = paddle_dist.get_world_size()
+dist_strategy.hybrid_configs = {
+    "mp_degree": world_size,
+    "pp_degree": 1,
+    "dp_degree": 1,
+}
+rank = paddle_dist.get_rank()
+paddle_dist.fleet.init(is_collective=True, strategy = dist_strategy)
+
+set_random_seed(1024)
+
+group = paddle_dist.collective._get_default_group()
+
+for test_mode in [1,2]:
+    print("test_mode_{test_mode} start*************************************************************************" \
+        .format(test_mode=test_mode))
+
+    # Tolerance handed to np_assert_accuracy for this mode.
+    if test_mode == 1:
+        atol = 1e-2
+    elif test_mode == 2:
+        atol = 1e-6
+
+    global_out.clear()
+    global_dout.clear()
+    for dtype in dtype_list:
+
+        np_input_dir_forward = "./data/vpe-int64-bf16-bf16-eager_in_tmp_477-word_embedding_expanded_{rank}.w_0-eager_in_tmp_486-pp-0-mp-{rank}.npz".format(rank=rank)
+        np_input_dir_backward = "./data/vpe-int64-bf16-bf16-eager_in_tmp_477-word_embedding_expanded_{rank}.w_0-eager_in_tmp_486-pp-0-mp-{rank}.npy".format(rank=rank)
+
+        test_paddle = TestPaddle(group, test_mode, np_input_dir_forward, np_input_dir_backward, dtype)
+        test_paddle._test_eager_accuracy()
+
+    # Print a failure but keep going so the backward comparison still runs.
+    try:
+        np_assert_accuracy(
+            global_out[0],
+            global_out[1],
+            atol,
+            atol,
+            "fp32_vs_bf16",
+            version_a="fp32",
+            version_b="bf16",
+            eager_or_static_mode="eager",
+            fwd_or_bkd="forward",
+            api="fleet.meta_parallel.VocabParallelEmbedding",
+        )
+    except Exception as e:
+        print(e)
+
+    try:
+        np_assert_accuracy(
+            global_dout[0],
+            global_dout[1],
+            atol,
+            atol,
+            "fp32_vs_bf16",
+            version_a="fp32",
+            version_b="bf16",
+            eager_or_static_mode="eager",
+            fwd_or_bkd="backward",
+            api="fleet.meta_parallel.VocabParallelEmbedding",
+        )
+    except Exception as e:
+        print(e)
+    print("test_mode_{test_mode} end*************************************************************************" \
+        .format(test_mode=test_mode))
+
diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/init_config_class.py
b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/init_config_class.py
new file mode 100644
index 0000000..3d7f2cd
--- /dev/null
+++ b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/init_config_class.py
@@ -0,0 +1,36 @@
+import numpy as np
+import sys
+sys.path.append("../../..")
+from utils import TOLERANCE, convert_dtype_to_torch_type
+
+class InitConfigClass:
+    """Load the test inputs and tolerances used by the embedding test."""
+
+    def __init__(self, np_input_dir_forward="", np_input_dir_backward="", dtype=""):
+        """Store the settings, look up the tolerances and load the input arrays."""
+        self._init_params(np_input_dir_forward, np_input_dir_backward, dtype)
+        self._init_threshold()
+        self._init_np_inputs_and_dout()
+
+    def _init_params(self, np_input_dir_forward="", np_input_dir_backward="", dtype=""):
+        """Record the two input file paths and the dtype under test."""
+        self._np_input_dir_forward = np_input_dir_forward
+        self._np_input_dir_backward = np_input_dir_backward
+        self._dtype = dtype
+
+    def _init_threshold(self):
+        """Read atol/rtol for the current dtype from TOLERANCE."""
+        self._atol = TOLERANCE[self._dtype]["atol"]
+        self._rtol = TOLERANCE[self._dtype]["rtol"]
+
+    def _init_np_inputs_and_dout(self):
+        """Load x, weight, the two sizes and the output gradient from disk."""
+        # np.load keeps an .npz archive open until it is closed, so read every
+        # entry inside the with-block.
+        with np.load(self._np_input_dir_forward) as np_inputs_array_forward:
+            self._np_x = np_inputs_array_forward["x"]
+            self._np_table = np_inputs_array_forward["weight"]
+            # Archive entries come back as arrays; convert the sizes to plain ints.
+            self._num_embeddings = int(np_inputs_array_forward['num_embeddings'].item())
+            self._embedding_dim = int(np_inputs_array_forward['embedding_dim'].item())
+        self._np_dout = np.load(self._np_input_dir_backward)
\ No newline at end of file
diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/prepare_data.py b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/prepare_data.py
new file mode 100644
index 0000000..d7407c3
--- /dev/null
+++ b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/prepare_data.py
@@ -0,0 +1,35 @@
+import os
+import numpy as np
+import init_config_class
+
+dim_1_list = [31250]
+dim_2_list = [8192]
+dim_3_list = [14336]
+
+card_num = 8
+
+def generate_np_inputs_and_dout():
+    """Write one forward .npz and one backward .npy file per rank under ./data."""
+    # np.savez/np.save do not create missing directories.
+    os.makedirs("./data", exist_ok=True)
+
+    for i in range(card_num):
+        # NOTE(review): reseeding on every iteration gives all ranks identical
+        # arrays - confirm this is intended.
+        np.random.seed(0)
+
+        dim_1 = dim_1_list[0]
+        dim_2 = dim_2_list[0]
+        dim_3 = dim_3_list[0]
+        x_case1 = np.random.randint(low=0, high=dim_1, size=[1, dim_2]).astype("int64")
+        table_case1 = np.random.random(size=[dim_1, dim_3]).astype("float32") - 0.5
+        dout_case1 = np.random.random(size=[1, dim_2, dim_3]).astype("float32") - 0.5
+
+        np_input_dir_forward = "./data/vpe-int64-bf16-bf16-eager_in_tmp_477-word_embedding_expanded_{rank}.w_0-eager_in_tmp_486-pp-0-mp-{rank}.npz".format(rank=i)
+        np_input_dir_backward = "./data/vpe-int64-bf16-bf16-eager_in_tmp_477-word_embedding_expanded_{rank}.w_0-eager_in_tmp_486-pp-0-mp-{rank}.npy".format(rank=i)
+
+        np.savez(np_input_dir_forward, x = x_case1, weight = table_case1, num_embeddings=dim_1 * card_num, embedding_dim=dim_3)
+        np.save(np_input_dir_backward, dout_case1)
+
+
+generate_np_inputs_and_dout()
\ No newline at end of file
diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/test_vocab_parallel_embedding.sh b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/test_vocab_parallel_embedding.sh
new file mode 100644
index 0000000..619643c
--- /dev/null
+++ b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_eb_real_data/test_vocab_parallel_embedding.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+set -ex
+
+# Usage: bash test_vocab_parallel_embedding.sh card_num develop [log_dir]
+card_num=$1
+version=$2
+# Fall back to "log" so a missing argument cannot make --log_dir swallow the script name.
+log_dir=${3:-log}
+
+export NVIDIA_TF32_OVERRIDE=0
+
+case "$card_num" in
+    2 ) export CUDA_VISIBLE_DEVICES=0,1 ;;
+    4 ) export CUDA_VISIBLE_DEVICES=0,1,2,3 ;;
+    8 ) export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ;;
+    * )
+        echo '请输入正确的卡数'
+        # Exit non-zero so callers can detect the invalid card count.
+        exit 1
+        ;;
+esac
+
+#please run "bash test_vocab_parallel_embedding.sh * develop" first
+
+if [ "$version" == 'develop' ]; then
+    # rm -rf data/*.npz
+    # python prepare_data.py
+    python -m paddle.distributed.launch --log_dir "$log_dir" get_and_test_paddle_result.py
+else
+    echo "请输入develop或者incubate"
+    exit 1
+fi
\ No newline at end of file
diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/README.md b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/README.md
similarity index 100%
rename from
test_fp32_with_bf16/test_vocab_parallel_embedding/README.md rename to test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/README.md diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/get_and_test_paddle_result.py b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/get_and_test_paddle_result.py similarity index 99% rename from test_fp32_with_bf16/test_vocab_parallel_embedding/get_and_test_paddle_result.py rename to test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/get_and_test_paddle_result.py index 47bd657..e4cbdb2 100644 --- a/test_fp32_with_bf16/test_vocab_parallel_embedding/get_and_test_paddle_result.py +++ b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/get_and_test_paddle_result.py @@ -5,7 +5,7 @@ import init_config_class import random import sys -sys.path.append("../..") +sys.path.append("../../..") from utils import ( np_assert_accuracy ) diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/init_config_class.py b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/init_config_class.py similarity index 97% rename from test_fp32_with_bf16/test_vocab_parallel_embedding/init_config_class.py rename to test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/init_config_class.py index 83e6316..163551d 100644 --- a/test_fp32_with_bf16/test_vocab_parallel_embedding/init_config_class.py +++ b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/init_config_class.py @@ -1,6 +1,6 @@ import numpy as np import sys -sys.path.append("../..") +sys.path.append("../../..") from utils import TOLERANCE, convert_dtype_to_torch_type card_num = 8 diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/prepare_data.py b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/prepare_data.py similarity index 100% rename from test_fp32_with_bf16/test_vocab_parallel_embedding/prepare_data.py rename to 
test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/prepare_data.py diff --git a/test_fp32_with_bf16/test_vocab_parallel_embedding/test_vocab_parallel_embedding.sh b/test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/test_vocab_parallel_embedding.sh similarity index 100% rename from test_fp32_with_bf16/test_vocab_parallel_embedding/test_vocab_parallel_embedding.sh rename to test_fp32_with_bf16/test_vocab_parallel_embedding/test_random_data/test_vocab_parallel_embedding.sh