@@ -0,0 +1,5 @@
## How to run
```
# develop (card_num is the number of GPUs to run on and must be 2, 4, or 8; develop must be run before incubate; log_dir is the path for the output logs. The input data files must be placed in the data directory.)
$ bash test_vocab_parallel_embedding.sh card_num develop log_dir
```
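For example, to run the develop test on 8 cards and write logs to `./log` (the log directory name here is only an illustration):
```
$ bash test_vocab_parallel_embedding.sh 8 develop ./log
```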
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import numpy as np
import paddle
import paddle.distributed as paddle_dist
import paddle.distributed.fleet as fleet
import init_config_class
import random
import sys
sys.path.append("../../..")
from utils import (
    np_assert_accuracy,
)

# Results collected across the fp32 and bf16 runs: forward outputs and
# embedding-weight gradients, compared pairwise at the end of each test mode.
global_out = []
global_dout = []

def set_random_seed(seed):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    paddle.seed(seed)
    fleet.meta_parallel.model_parallel_random_seed(seed)

class TestPaddle(init_config_class.InitConfigClass):
    def __init__(self, group, test_mode=1, np_input_dir_forward="", np_input_dir_backward="", dtype=""):
        self._init_params(np_input_dir_forward, np_input_dir_backward, dtype)
        self._init_threshold()
        self._init_np_inputs_and_dout()
        self._group = group
        self.test_mode = test_mode

    def _gen_eager_inputs_and_dout(self):
        place = paddle.device.get_device()
        x_eager = paddle.to_tensor(
            self._np_x,
            dtype="int64",
            place=place,
        )
        x_eager.stop_gradient = False
        # bfloat16 inputs are materialized as float32 here; _cal_eager_res
        # casts them to bf16 (uint16 storage) before use.
        table_eager = paddle.to_tensor(
            self._np_table,
            dtype=self._dtype if self._dtype != "bfloat16" else "float32",
            place=place,
        )
        table_eager.stop_gradient = False
        dout_eager = paddle.to_tensor(
            self._np_dout,
            dtype=self._dtype if self._dtype != "bfloat16" else "float32",
            place=place,
        )
        dout_eager.stop_gradient = False
        return x_eager, table_eager, dout_eager

    def _cal_eager_res(self, x, table, dout):
        x_t = x
        table_t = table
        dout_t = dout

        if self._dtype == "float32":
            # Round-trip the fp32 inputs through uint16 (the bf16 bit pattern)
            # so both runs start from the same bf16-representable values.
            table_t = paddle.cast(table, dtype="uint16")
            dout_t = paddle.cast(dout, dtype="uint16")
            table_t = paddle.cast(table_t, dtype="float32")
            dout_t = paddle.cast(dout_t, dtype="float32")

        if self._dtype == "bfloat16":
            table_t = paddle.cast(table, dtype="uint16")
            dout_t = paddle.cast(dout, dtype="uint16")

        embedding = fleet.meta_parallel.VocabParallelEmbedding(self._num_embeddings, self._embedding_dim, mp_group=self._group)
        paddle.assign(table_t, embedding.weight)
        out = embedding(x_t)

        out_grads = paddle.grad(
            [out], [embedding.weight], grad_outputs=[dout_t]
        )
        out_grads = out_grads[0]

        if self._dtype == "bfloat16":
            out = paddle.cast(out, dtype="float32")
            out_grads = paddle.cast(out_grads, dtype="float32")

        if self.test_mode == 2 and self._dtype == "float32":
            # In test mode 2 the fp32 results are also quantized to bf16
            # precision, so the two runs can be compared with a tight atol.
            out = paddle.cast(out, dtype="uint16")
            out_grads = paddle.cast(out_grads, dtype="uint16")
            out = paddle.cast(out, dtype="float32")
            out_grads = paddle.cast(out_grads, dtype="float32")

        return out, out_grads

    def _test_eager_accuracy(self):
        x_eager, table_eager, dout_eager = self._gen_eager_inputs_and_dout()
        out_eager, out_grads_eager = self._cal_eager_res(x_eager, table_eager, dout_eager)

        del x_eager
        del table_eager
        del dout_eager
        paddle.device.cuda.empty_cache()

        out_eager_np = out_eager.numpy()
        out_grads_eager_np = out_grads_eager.numpy()

        global_out.append(out_eager_np)
        global_dout.append(out_grads_eager_np)

        del out_eager
        del out_grads_eager
        paddle.device.cuda.empty_cache()


dtype_list = ["float32", "bfloat16"]
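# The "float32" run fills index 0 of global_out / global_dout and the
# "bfloat16" run fills index 1; the asserts below compare them pairwise.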

dist_strategy = fleet.DistributedStrategy()
world_size = paddle_dist.get_world_size()
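# Every visible GPU joins a single model-parallel group; pipeline and data
# parallelism are disabled.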
dist_strategy.hybrid_configs = {
    "mp_degree": world_size,
    "pp_degree": 1,
    "dp_degree": 1,
}
rank = paddle_dist.get_rank()
paddle_dist.fleet.init(is_collective=True, strategy=dist_strategy)

set_random_seed(1024)

# The default (global) communication group serves as the model-parallel group.
group = paddle_dist.collective._get_default_group()

for test_mode in [1, 2]:
    print("test_mode_{test_mode} start*************************************************************************" \
        .format(test_mode=test_mode))

    # Mode 1 compares raw fp32 against bf16, so the tolerance is loose;
    # mode 2 quantizes the fp32 results to bf16 precision first (see
    # _cal_eager_res), so the comparison can be much tighter.
    if test_mode == 1:
        atol = 1e-2
    elif test_mode == 2:
        atol = 1e-6

    global_out.clear()
    global_dout.clear()
    for dtype in dtype_list:
        np_input_dir_forward = "./data/vpe-int64-bf16-bf16-eager_in_tmp_477-word_embedding_expanded_{rank}.w_0-eager_in_tmp_486-pp-0-mp-{rank}.npz".format(rank=rank)
        np_input_dir_backward = "./data/vpe-int64-bf16-bf16-eager_in_tmp_477-word_embedding_expanded_{rank}.w_0-eager_in_tmp_486-pp-0-mp-{rank}.npy".format(rank=rank)

        test_paddle = TestPaddle(group, test_mode, np_input_dir_forward, np_input_dir_backward, dtype)
        test_paddle._test_eager_accuracy()

    try:
        np_assert_accuracy(
            global_out[0],
            global_out[1],
            atol,
            atol,
            "fp32_vs_bf16",
            version_a="fp32",
            version_b="bf16",
            eager_or_static_mode="eager",
            fwd_or_bkd="forward",
            api="fleet.meta_parallel.VocabParallelEmbedding",
        )
    except Exception as e:
        print(e)

    try:
        np_assert_accuracy(
            global_dout[0],
            global_dout[1],
            atol,
            atol,
            "fp32_vs_bf16",
            version_a="fp32",
            version_b="bf16",
            eager_or_static_mode="eager",
            fwd_or_bkd="backward",
            api="fleet.meta_parallel.VocabParallelEmbedding",
        )
    except Exception as e:
        print(e)

    print("test_mode_{test_mode} end*************************************************************************" \
        .format(test_mode=test_mode))

@@ -0,0 +1,29 @@
import numpy as np
import sys
sys.path.append("../../..")
from utils import TOLERANCE, convert_dtype_to_torch_type

class InitConfigClass:
    def __init__(self):
        self._init_params()
        self._init_threshold()
        self._init_np_inputs_and_dout()

    def _init_params(self, np_input_dir_forward="", np_input_dir_backward="", dtype=""):
        self._np_input_dir_forward = np_input_dir_forward
        self._np_input_dir_backward = np_input_dir_backward
        self._dtype = dtype

    def _init_threshold(self):
        self._atol = TOLERANCE[self._dtype]["atol"]
        self._rtol = TOLERANCE[self._dtype]["rtol"]

    def _init_np_inputs_and_dout(self):
        # Forward inputs come from an .npz archive, the upstream gradient
        # from a plain .npy file.
        np_inputs_array_forward = np.load(self._np_input_dir_forward)
        np_inputs_array_backward = np.load(self._np_input_dir_backward)
        self._np_x = np_inputs_array_forward["x"]
        self._np_table = np_inputs_array_forward["weight"]
        self._np_dout = np_inputs_array_backward
        self._num_embeddings = np_inputs_array_forward["num_embeddings"]
        self._embedding_dim = np_inputs_array_forward["embedding_dim"]
@@ -0,0 +1,29 @@
import numpy as np

dim_1_list = [31250]
dim_2_list = [8192]
dim_3_list = [14336]

card_num = 8

def generate_np_inputs_and_dout():
    for i in range(card_num):
        # Reset the seed each iteration so every rank gets identical data.
        np.random.seed(0)

        dim_1 = dim_1_list[0]
        dim_2 = dim_2_list[0]
        dim_3 = dim_3_list[0]
        x_case1 = np.random.randint(low=0, high=dim_1, size=[1, dim_2]).astype("int64")
        table_case1 = np.random.random(size=[dim_1, dim_3]).astype("float32") - 0.5
        dout_case1 = np.random.random(size=[1, dim_2, dim_3]).astype("float32") - 0.5

        np_input_dir_forward = "./data/vpe-int64-bf16-bf16-eager_in_tmp_477-word_embedding_expanded_{rank}.w_0-eager_in_tmp_486-pp-0-mp-{rank}.npz".format(rank=i)
        np_input_dir_backward = "./data/vpe-int64-bf16-bf16-eager_in_tmp_477-word_embedding_expanded_{rank}.w_0-eager_in_tmp_486-pp-0-mp-{rank}.npy".format(rank=i)

        # Each rank holds a dim_1-row shard, so the global vocabulary size
        # is dim_1 * card_num.
        np.savez(np_input_dir_forward, x=x_case1, weight=table_case1, num_embeddings=dim_1 * card_num, embedding_dim=dim_3)
        np.save(np_input_dir_backward, dout_case1)


generate_np_inputs_and_dout()
@@ -0,0 +1,28 @@
#!/bin/bash
set -ex

card_num=$1
version=$2
log_dir=$3

export NVIDIA_TF32_OVERRIDE=0

case $card_num in
    2 ) export CUDA_VISIBLE_DEVICES=0,1 ;;
    4 ) export CUDA_VISIBLE_DEVICES=0,1,2,3 ;;
    8 ) export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ;;
    * )
        echo 'Please pass a valid card count (2, 4, or 8)'
        exit 1
        ;;
esac

# Please run "bash test_vocab_parallel_embedding.sh * develop" first.

if [ "$version" == 'develop' ]; then
# rm -rf data/*.npz
# python prepare_data.py
python -m paddle.distributed.launch --log_dir $log_dir get_and_test_paddle_result.py
else
echo "请输入develop或者incubate"
fi
@@ -5,7 +5,7 @@
import init_config_class
import random
import sys
-sys.path.append("../..")
+sys.path.append("../../..")
from utils import (
    np_assert_accuracy
)
@@ -1,6 +1,6 @@
import numpy as np
import sys
-sys.path.append("../..")
+sys.path.append("../../..")
from utils import TOLERANCE, convert_dtype_to_torch_type

card_num = 8