From c1587812be2164897c1b7164bfd3300fef581175 Mon Sep 17 00:00:00 2001 From: guanlongjie Date: Thu, 31 Aug 2023 19:36:29 +0800 Subject: [PATCH 01/43] glm_config --- training/kunlunxin/README.md | 5 ++++- training/kunlunxin/docker_image/pytorch/Dockerfile | 2 +- training/kunlunxin/docker_image/pytorch/pytorch_install.sh | 6 +++--- training/kunlunxin/glm-pytorch/config/config_R300x1x8.py | 2 +- training/kunlunxin/glm-pytorch/config/requirements.txt | 3 +++ 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/training/kunlunxin/README.md b/training/kunlunxin/README.md index b3fc2d399..5818852d9 100644 --- a/training/kunlunxin/README.md +++ b/training/kunlunxin/README.md @@ -26,7 +26,10 @@ R480-X8基于多芯片间高速互联技术,单机可提供高达1 Peta Ops @F - OS版本:Ubuntu 20.04 - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu18.04:v0.04 + - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 + - 训练框架版本: xmlir+111e7d45[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl) + - 训练编译器版本: xacc+111e7d45[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl) + - 依赖软件版本:pytorch-1.12.1+cpu ## 容器镜像信息 - 容器构建信息 diff --git a/training/kunlunxin/docker_image/pytorch/Dockerfile b/training/kunlunxin/docker_image/pytorch/Dockerfile index b54597d86..2c5cb464d 100644 --- a/training/kunlunxin/docker_image/pytorch/Dockerfile +++ b/training/kunlunxin/docker_image/pytorch/Dockerfile @@ -1,3 +1,3 @@ # TODO: this is a temporary docker image from Docker Hub, need update to kunlunxin's Harbor registry. 
-from dynamicheart/pytorch1.12.1-cpu-ubuntu18.04:v0.04 +from dynamicheart/pytorch1.12.1-cpu-ubuntu20.04:v0.01 RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" diff --git a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh index 3db86efae..f6a97305d 100644 --- a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh +++ b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh @@ -2,6 +2,6 @@ set -xe -wget https://klx-public.bj.bcebos.com/xmlir/flagopen/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl -O ~/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl -pip install ~/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl -rm ~/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl + +pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl +pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py index add30cda7..5437e614d 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py @@ -2,7 +2,7 @@ fp16 = False train_batch_size = 4 -eval_batch_size = 6 +eval_batch_size = 4 dist_backend = "xccl" diff --git a/training/kunlunxin/glm-pytorch/config/requirements.txt b/training/kunlunxin/glm-pytorch/config/requirements.txt index 3adfcca6c..0f842cdff 100644 --- a/training/kunlunxin/glm-pytorch/config/requirements.txt +++ b/training/kunlunxin/glm-pytorch/config/requirements.txt @@ -1,3 +1,6 @@ h5sparse boto3 h5py +numpy>=1.15.4 +sentencepiece>=0.1.8 +jieba From 3f6afe780ed8a9ff9c5e98511fc19343ca6016c6 Mon Sep 17 00:00:00 2001 From: GGuanl Date: Fri, 1 Sep 2023 10:59:22 +0800 Subject: [PATCH 02/43] fix_#1 --- training/kunlunxin/glm-pytorch/config/config_R300x1x1.py | 2 +- 
training/kunlunxin/glm-pytorch/config/config_R300x2x8.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py index add30cda7..5437e614d 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py @@ -2,7 +2,7 @@ fp16 = False train_batch_size = 4 -eval_batch_size = 6 +eval_batch_size = 4 dist_backend = "xccl" diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py index add30cda7..5437e614d 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py @@ -2,7 +2,7 @@ fp16 = False train_batch_size = 4 -eval_batch_size = 6 +eval_batch_size = 4 dist_backend = "xccl" From c48bd1b0f5b27a89d444df459d09a622da0a8c40 Mon Sep 17 00:00:00 2001 From: GGuanl Date: Wed, 6 Sep 2023 14:28:50 +0800 Subject: [PATCH 03/43] glm-config_updated --- training/kunlunxin/glm-pytorch/config/config_R300x1x8.py | 4 ++-- .../kunlunxin/glm-pytorch/config/environment_variables.sh | 5 +++++ training/run_benchmarks/config/cluster_conf.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py index 5437e614d..c38fadf89 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py @@ -1,8 +1,8 @@ vendor = 'kunlunxin' fp16 = False -train_batch_size = 4 -eval_batch_size = 4 +train_batch_size = 5 +eval_batch_size = 5 dist_backend = "xccl" diff --git a/training/kunlunxin/glm-pytorch/config/environment_variables.sh b/training/kunlunxin/glm-pytorch/config/environment_variables.sh index 8a0dfcd75..a97c7d96c 100755 --- 
a/training/kunlunxin/glm-pytorch/config/environment_variables.sh +++ b/training/kunlunxin/glm-pytorch/config/environment_variables.sh @@ -7,6 +7,11 @@ export BKCL_TIMEOUT=1800 # when using tree allreduce, the number of nodes must be a multiple of 2 export BKCL_SOCKET_FORCE_TREE=1 +export XMLIR_D_XPU_L3_SIZE=66060288 + +export BKCL_PCIE_RING=1 +export BKCL_FORCE_SYNC=1 + export ALLREDUCE_ASYNC=false export ALLREDUCE_FUSION=0 diff --git a/training/run_benchmarks/config/cluster_conf.py b/training/run_benchmarks/config/cluster_conf.py index be628e197..ea6e7c901 100644 --- a/training/run_benchmarks/config/cluster_conf.py +++ b/training/run_benchmarks/config/cluster_conf.py @@ -1,7 +1,7 @@ '''Cluster configs''' # Hosts to run the benchmark. Each item is an IP address or a hostname. -HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"] +HOSTS = ["xx.xx.xx.xx"] # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored' HOSTS_PORTS = ["2222"] From acbf4b657b1cbebae846783b80a0192fac6a1ef7 Mon Sep 17 00:00:00 2001 From: GGuanl Date: Wed, 6 Sep 2023 15:15:05 +0800 Subject: [PATCH 04/43] glm-config-updated#2 --- .../kunlunxin/glm-pytorch/config/environment_variables.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/config/environment_variables.sh b/training/kunlunxin/glm-pytorch/config/environment_variables.sh index a97c7d96c..9c9f20b8e 100755 --- a/training/kunlunxin/glm-pytorch/config/environment_variables.sh +++ b/training/kunlunxin/glm-pytorch/config/environment_variables.sh @@ -7,9 +7,9 @@ export BKCL_TIMEOUT=1800 # when using tree allreduce, the number of nodes must be a multiple of 2 export BKCL_SOCKET_FORCE_TREE=1 -export XMLIR_D_XPU_L3_SIZE=66060288 +export XMLIR_D_XPU_L3_SIZE=32505856 -export BKCL_PCIE_RING=1 +export BKCL_CCIX_RING=1 export BKCL_FORCE_SYNC=1 export ALLREDUCE_ASYNC=false From 53b099996f9422efd49c449fdfd695eb1171b438 Mon Sep 17 00:00:00 2001 From: GGuanl Date: Wed, 6 Sep 2023 15:17:16 
+0800 Subject: [PATCH 05/43] glm_config-updated#2 --- training/run_benchmarks/config/cluster_conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/run_benchmarks/config/cluster_conf.py b/training/run_benchmarks/config/cluster_conf.py index ea6e7c901..be628e197 100644 --- a/training/run_benchmarks/config/cluster_conf.py +++ b/training/run_benchmarks/config/cluster_conf.py @@ -1,7 +1,7 @@ '''Cluster configs''' # Hosts to run the benchmark. Each item is an IP address or a hostname. -HOSTS = ["xx.xx.xx.xx"] +HOSTS = ["10.1.2.2", "10.1.2.3", "10.1.2.4"] # Hosts port to run the tensorflow distribution_strategy = 'multi_worker_mirrored' HOSTS_PORTS = ["2222"] From dd3c47875acbeb4a94d9dd7add1b9441009e1aae Mon Sep 17 00:00:00 2001 From: GGuanl Date: Wed, 6 Sep 2023 16:03:02 +0800 Subject: [PATCH 06/43] glm_config-#2 --- training/kunlunxin/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/README.md b/training/kunlunxin/README.md index 5818852d9..2abcfa4a0 100644 --- a/training/kunlunxin/README.md +++ b/training/kunlunxin/README.md @@ -27,8 +27,8 @@ R480-X8基于多芯片间高速互联技术,单机可提供高达1 Peta Ops @F - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 - - 训练框架版本: xmlir+111e7d45[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl) - - 训练编译器版本: xacc+111e7d45[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl) + - 训练框架版本: xmlir+111e7d45[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl) + - 训练编译器版本: xacc+111e7d45[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl) - 依赖软件版本:pytorch-1.12.1+cpu ## 容器镜像信息 From 7659bea251ec27c75bc5a0261b37b095c713b02b Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 
2023 18:41:17 +0800 Subject: [PATCH 07/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index 091debba4..d3b724d9e 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -14,8 +14,9 @@ - OS版本:Ubuntu 20.04 - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu18.04:v0.04 - - 训练框架版本:xmlir+e70db8f6 + - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 + - 训练框架版本:xmlir+111e7d45 [xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl) + - 训练编译器版本:xacc+111e7d45 [xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl) - 依赖软件版本:pytorch-1.12.1+cpu ### 测试运行方法 From f953f4e411075c117eb4ba0ea7911079990a4dba Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:42:32 +0800 Subject: [PATCH 08/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index d3b724d9e..e0dd73604 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -15,8 +15,8 @@ - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 - - 训练框架版本:xmlir+111e7d45 [xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl) - - 训练编译器版本:xacc+111e7d45 [xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl) + - 训练框架版本:xmlir+111e7d45 
[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl) + - 训练编译器版本:xacc+111e7d45 [xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl) - 依赖软件版本:pytorch-1.12.1+cpu ### 测试运行方法 From 04d5bd9a035fd52c5cda579987ec2e2cb664cb63 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:43:29 +0800 Subject: [PATCH 09/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index e0dd73604..b638eba88 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -15,8 +15,8 @@ - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 - - 训练框架版本:xmlir+111e7d45 [xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl) - - 训练编译器版本:xacc+111e7d45 [xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl) + - 训练框架版本:xmlir+111e7d45 【[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)】 + - 训练编译器版本:xacc+111e7d45 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 - 依赖软件版本:pytorch-1.12.1+cpu ### 测试运行方法 From c038ab7a9eaf2bafa821e73a0d09c61dfe669a5b Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:44:40 +0800 Subject: [PATCH 10/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 65 ++++++++++-------------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/README.md 
b/training/kunlunxin/glm-pytorch/README.md index b638eba88..cca6433ad 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -19,41 +19,30 @@ - 训练编译器版本:xacc+111e7d45 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/archives/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 - 依赖软件版本:pytorch-1.12.1+cpu -### 测试运行方法 - -修改`FlagPerf/training/run_benchmarks/config/test_conf.py`文件里的配置项: - -```python -VENDOR = "kunlunxin" - -ACCE_CONTAINER_OPT = " --device=/dev/xpu0 --device=/dev/xpu1 --device=/dev/xpu2" + \ - " --device=/dev/xpu3 --device=/dev/xpu4 --device=/dev/xpu5" + \ - " --device=/dev/xpu6 --device=/dev/xpu7 --device=/dev/xpuctrl" - -ACCE_VISIBLE_DEVICE_ENV_NAME = "XPU_VISIBLE_DEVICES" - -CASES = [ - "GLM_TORCH_DEMO_R300_1X1", - "GLM_TORCH_DEMO_R300_1X2", - "GLM_TORCH_DEMO_R300_1X4", - "GLM_TORCH_DEMO_R300_1X8", - "GLM_TORCH_DEMO_R300_2X8" -] -``` - -剩余步骤按照项目根目录文档下的[“快速启动”](../../../README.md#快速启动)章节进行。 - - -### 运行情况参考 - -| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能(samples/s) | -|---------| --------------- | ----------- | -------- | -------- | ------- | ---------------- | -| 单机1卡 | config_R300x1x1 | 121371.25| 0.8 | 0.8021 | 14400(fp32)| 0.50 | -| 单机2卡 | config_R300x1x2 | 106709.60| 0.8 | 0.8085 | 12000(fp32)| 0.92 | -| 单机4卡 | config_R300x1x4 | 44162.12 | 0.8 | 0.8027 | 4800(fp32) | 1.79 | -| 单机8卡 | config_R300x1x8 | 22902.82 | 0.8 | 0.8003 | 2400(fp32) | 3.47 | -| 两机8卡 | config_R300x2x8 | 16217.80 | 0.8 | 0.8012 | 1500(fp32) | 6.08 | - -### 许可证 - -Apache 2.0 license。 +#### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ------------------------------ | ------------------------------------------- | +| 任务类别 | 文本分类、文本生成 | | +| 模型 | cpm | | +| 数据集 | CPM-Finetune-data | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | R300 | | +| 硬件存储使用 | mem(actual/total),见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 
总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | acc,见“性能指标” | 分类准确率(mlm_accuracy) | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | +| R300单机单卡(1x1) | fp16 | bs=64,lr=0.0005 | | | | | | | +| R300单机8卡(1x8) | fp16 | bs=64,lr=0.0005 | | | | | 0.9261| 18.25/32.0 | +| R300两机8卡(2x8) | fp16 | bs=64,lr=0.0005 | | | | | | | From 046fdf7f9fffad6a1e6135c29442d26725c22570 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:51:44 +0800 Subject: [PATCH 11/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index cca6433ad..f5f43e8c6 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -25,7 +25,7 @@ | 指标名称 | 指标值 | 特殊说明 | | -------------- | ------------------------------ | ------------------------------------------- | -| 任务类别 | 文本分类、文本生成 | | +| 任务类别 | 通用语言模型 | | | 模型 | cpm | | | 数据集 | CPM-Finetune-data | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | From 6b6fd85f59466ccacfdad9b715e04eff3463ce96 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:52:21 +0800 Subject: [PATCH 12/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index f5f43e8c6..7884ae6c8 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -26,8 +26,8 @@ | 指标名称 | 指标值 | 
特殊说明 | | -------------- | ------------------------------ | ------------------------------------------- | | 任务类别 | 通用语言模型 | | -| 模型 | cpm | | -| 数据集 | CPM-Finetune-data | | +| 模型 | glm | | +| 数据集 | ReCord | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | | 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | | 硬件设备简称 | R300 | | From 4f998e3e56a634c82ada9caa1420eb8075d8fb8b Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:52:50 +0800 Subject: [PATCH 13/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index 7884ae6c8..bacaddfd1 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -27,7 +27,7 @@ | -------------- | ------------------------------ | ------------------------------------------- | | 任务类别 | 通用语言模型 | | | 模型 | glm | | -| 数据集 | ReCord | | +| 数据集 | ReCoRD | | | 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | | 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | | 硬件设备简称 | R300 | | From 629b37aff9cf9f232899374fa4e972952b4ccd0e Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 19:06:10 +0800 Subject: [PATCH 14/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index bacaddfd1..fa9baedac 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -43,6 +43,6 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | -| R300单机单卡(1x1) | fp16 | bs=64,lr=0.0005 | | | | | | | -| R300单机8卡(1x8) | fp16 | bs=64,lr=0.0005 | 
| | | | 0.9261| 18.25/32.0 | -| R300两机8卡(2x8) | fp16 | bs=64,lr=0.0005 | | | | | | | +| R300单机单卡(1x1) | fp32 | bs=5,lr=1e-05 | | | | | | | +| R300单机8卡(1x8) | fp32 | bs=5,lr=1e-05 | 30764 | 3.21 | 3.64 | 3.646 | 80.52%| 31.8/32.0 | +| R300两机8卡(2x8) | fp32 | bs=5,lr=1e-05 | | | | | | | From be5eb372d0c3b380c2eba37a8040bed83534a967 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 19:27:47 +0800 Subject: [PATCH 15/43] Update pytorch_install.sh --- training/kunlunxin/docker_image/pytorch/pytorch_install.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh index f6a97305d..850a304b4 100644 --- a/training/kunlunxin/docker_image/pytorch/pytorch_install.sh +++ b/training/kunlunxin/docker_image/pytorch/pytorch_install.sh @@ -2,6 +2,5 @@ set -xe - pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl pip install https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl From 22eeefc805213bd0de3a46d5baa349e7f71d4240 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 19:56:12 +0800 Subject: [PATCH 16/43] Create config_common --- .../kunlunxin/glm-pytorch/config/config_common | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 training/kunlunxin/glm-pytorch/config/config_common diff --git a/training/kunlunxin/glm-pytorch/config/config_common b/training/kunlunxin/glm-pytorch/config/config_common new file mode 100644 index 000000000..b152bcebd --- /dev/null +++ b/training/kunlunxin/glm-pytorch/config/config_common @@ -0,0 +1,18 @@ +vendor = 'kunlunxin' +fp16 = False + +dist_backend = "xccl" + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 
0.1 +lr_decay_iters = 4338 +log_freq = 1 +seed = 4096 +max_samples_termination = 5553080 +training_event = None From 0dee798309a479adf1b7b24a72b91a9db94c36e8 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Wed, 6 Sep 2023 19:58:24 +0800 Subject: [PATCH 17/43] Update README.md --- training/kunlunxin/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/README.md b/training/kunlunxin/README.md index 2abcfa4a0..e41ab34be 100644 --- a/training/kunlunxin/README.md +++ b/training/kunlunxin/README.md @@ -27,8 +27,8 @@ R480-X8基于多芯片间高速互联技术,单机可提供高达1 Peta Ops @F - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 - - 训练框架版本: xmlir+111e7d45[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl) - - 训练编译器版本: xacc+111e7d45[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl) + - 训练框架版本: xmlir+111e7d45 【[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 + - 训练编译器版本: xacc+111e7d45 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)】 - 依赖软件版本:pytorch-1.12.1+cpu ## 容器镜像信息 From c2f993f6f1b78ca6fc2e6c5dea370478bde87667 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 09:40:42 +0800 Subject: [PATCH 18/43] Rename config_common to config_common.py --- .../glm-pytorch/config/{config_common => config_common.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename training/kunlunxin/glm-pytorch/config/{config_common => config_common.py} (100%) diff --git a/training/kunlunxin/glm-pytorch/config/config_common b/training/kunlunxin/glm-pytorch/config/config_common.py similarity index 100% rename from training/kunlunxin/glm-pytorch/config/config_common rename to 
training/kunlunxin/glm-pytorch/config/config_common.py From ca34bb6026a4c0231516a271275231a5752402c9 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:07:36 +0800 Subject: [PATCH 19/43] Update config_R300x2x8.py --- .../glm-pytorch/config/config_R300x2x8.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py index 5437e614d..efa17085f 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py @@ -1,21 +1,2 @@ -vendor = 'kunlunxin' -fp16 = False - train_batch_size = 4 eval_batch_size = 4 - -dist_backend = "xccl" - -lr = 1e-5 -weight_decay = 0.1 -adam_beta1 = 0.9 -adam_beta2 = 0.999 -adam_eps = 1e-08 -gradient_accumulation_steps = 1 -warmup = 0.1 -lr_decay_ratio = 0.1 -lr_decay_iters = 4338 -log_freq = 1 -seed = 4096 -max_samples_termination = 5553080 -training_event = None From a9d02b8b568666603ff8cf58839c2b806474be28 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:18:04 +0800 Subject: [PATCH 20/43] Update config_R300x1x1.py --- .../glm-pytorch/config/config_R300x1x1.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py index 5437e614d..efa17085f 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py @@ -1,21 +1,2 @@ -vendor = 'kunlunxin' -fp16 = False - train_batch_size = 4 eval_batch_size = 4 - -dist_backend = "xccl" - -lr = 1e-5 -weight_decay = 0.1 -adam_beta1 = 0.9 -adam_beta2 = 0.999 -adam_eps = 1e-08 -gradient_accumulation_steps = 1 -warmup = 0.1 -lr_decay_ratio = 0.1 -lr_decay_iters = 4338 -log_freq = 1 -seed = 4096 
-max_samples_termination = 5553080 -training_event = None From 972ed9f51163800efc94a60505bca41db1f25464 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:18:33 +0800 Subject: [PATCH 21/43] Update config_R300x1x8.py --- .../glm-pytorch/config/config_R300x1x8.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py index c38fadf89..6303ad4e8 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py @@ -1,21 +1,2 @@ -vendor = 'kunlunxin' -fp16 = False - train_batch_size = 5 eval_batch_size = 5 - -dist_backend = "xccl" - -lr = 1e-5 -weight_decay = 0.1 -adam_beta1 = 0.9 -adam_beta2 = 0.999 -adam_eps = 1e-08 -gradient_accumulation_steps = 1 -warmup = 0.1 -lr_decay_ratio = 0.1 -lr_decay_iters = 4338 -log_freq = 1 -seed = 4096 -max_samples_termination = 5553080 -training_event = None From f1714c50e5015d9e5ec1abdda2d7a12c246d3583 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:20:50 +0800 Subject: [PATCH 22/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index fa9baedac..6b942e797 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -44,5 +44,5 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | | R300单机单卡(1x1) | fp32 | bs=5,lr=1e-05 | | | | | | | -| R300单机8卡(1x8) | fp32 | bs=5,lr=1e-05 | 30764 | 3.21 | 3.64 | 3.646 | 80.52%| 31.8/32.0 | +| R300单机8卡(1x8) | fp32 | bs=5,lr=1e-05 | | | | | 80.52%| 
31.8/32.0 | | R300两机8卡(2x8) | fp32 | bs=5,lr=1e-05 | | | | | | | From ace9fea668596df43bde6434503509e09e4a151c Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:22:21 +0800 Subject: [PATCH 23/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index 6b942e797..3570832e3 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -43,6 +43,6 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | -| R300单机单卡(1x1) | fp32 | bs=5,lr=1e-05 | | | | | | | +| R300单机单卡(1x1) | fp32 | bs=5,lr=1e-05 | | | | | | | | R300单机8卡(1x8) | fp32 | bs=5,lr=1e-05 | | | | | 80.52%| 31.8/32.0 | | R300两机8卡(2x8) | fp32 | bs=5,lr=1e-05 | | | | | | | From c474b636c890e5fbf440c946d5cdc742052dcf73 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:22:41 +0800 Subject: [PATCH 24/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index 3570832e3..835d73f9f 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -43,6 +43,6 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | -| R300单机单卡(1x1) | fp32 | bs=5,lr=1e-05 | | | | | | | +| R300单机单卡(1x1) | fp32 | bs=4,lr=1e-05 | | | | | | | | R300单机8卡(1x8) | fp32 | bs=5,lr=1e-05 | | | | | 80.52%| 31.8/32.0 | -| R300两机8卡(2x8) | fp32 | bs=5,lr=1e-05 | 
| | | | | | +| R300两机8卡(2x8) | fp32 | bs=4,lr=1e-05 | | | | | | | From 9e308091343735bd36036f7b2df34e1b5bb8304b Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:21:33 +0800 Subject: [PATCH 25/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index 835d73f9f..0f9580340 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -45,4 +45,4 @@ | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | | R300单机单卡(1x1) | fp32 | bs=4,lr=1e-05 | | | | | | | | R300单机8卡(1x8) | fp32 | bs=5,lr=1e-05 | | | | | 80.52%| 31.8/32.0 | -| R300两机8卡(2x8) | fp32 | bs=4,lr=1e-05 | | | | | | | +| R300两机8卡(2x8) | fp32 | bs=4,lr=1e-05 | | | | | 80.31% | 31.7/32.0 | From 015a751b99815d95a5de235deddde8921ad3919f Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:22:07 +0800 Subject: [PATCH 26/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index 0f9580340..88dfe97c6 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -43,6 +43,6 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | -| R300单机单卡(1x1) | fp32 | bs=4,lr=1e-05 | | | | | | | +| R300单机单卡(1x1) | fp32 | bs=5,lr=1e-05 | | | | | | | | R300单机8卡(1x8) | fp32 | bs=5,lr=1e-05 | | | | | 80.52%| 31.8/32.0 | | R300两机8卡(2x8) | fp32 | bs=4,lr=1e-05 | | | | | 80.31% | 31.7/32.0 | From 
60dee8a76dc223c6ebb8dc05965d0dcd0c83bfef Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 11:23:37 +0800 Subject: [PATCH 27/43] Update requirements.txt --- training/kunlunxin/glm-pytorch/config/requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/config/requirements.txt b/training/kunlunxin/glm-pytorch/config/requirements.txt index 0f842cdff..8bac0066c 100644 --- a/training/kunlunxin/glm-pytorch/config/requirements.txt +++ b/training/kunlunxin/glm-pytorch/config/requirements.txt @@ -2,5 +2,3 @@ h5sparse boto3 h5py numpy>=1.15.4 -sentencepiece>=0.1.8 -jieba From 32eb6f1cf3c9e70157e23eb2469ab31547bccf8d Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 12:27:09 +0800 Subject: [PATCH 28/43] Update README.md --- training/kunlunxin/glm-pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/README.md b/training/kunlunxin/glm-pytorch/README.md index 88dfe97c6..bd5ae9d10 100644 --- a/training/kunlunxin/glm-pytorch/README.md +++ b/training/kunlunxin/glm-pytorch/README.md @@ -43,6 +43,6 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | ------------------- | --------- | ---------------- | -------- | ------- | ------- | ------ | ----- | --------- | -| R300单机单卡(1x1) | fp32 | bs=5,lr=1e-05 | | | | | | | +| R300单机单卡(1x1) | fp32 | bs=4,lr=1e-05 | | | | | 80.52% | 31.8/32.0 | | R300单机8卡(1x8) | fp32 | bs=5,lr=1e-05 | | | | | 80.52%| 31.8/32.0 | | R300两机8卡(2x8) | fp32 | bs=4,lr=1e-05 | | | | | 80.31% | 31.7/32.0 | From 9d81ff13da97efde2ae18a99a54f4b05d2705205 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:37:10 +0800 Subject: [PATCH 29/43] Update config_R300x1x1.py --- training/kunlunxin/glm-pytorch/config/config_R300x1x1.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py index efa17085f..017011698 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py @@ -1,2 +1,4 @@ +from config_common import* + train_batch_size = 4 eval_batch_size = 4 From 795299323a42e0ad86e4939c4aa956961827d445 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:37:22 +0800 Subject: [PATCH 30/43] Update config_R300x1x8.py --- training/kunlunxin/glm-pytorch/config/config_R300x1x8.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py index 6303ad4e8..9f1a38d33 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py @@ -1,2 +1,4 @@ +from config_common import* + train_batch_size = 5 eval_batch_size = 5 From 27b48b59f42f00394c35d784487677c85e7e3ae7 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:37:35 +0800 Subject: [PATCH 31/43] Update config_R300x2x8.py --- training/kunlunxin/glm-pytorch/config/config_R300x2x8.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py index efa17085f..017011698 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py @@ -1,2 +1,4 @@ +from config_common import* + train_batch_size = 4 eval_batch_size = 4 From c0e3ab4f20cd3e5cdc6e41bd4756cb715f304cd5 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:39:34 +0800 Subject: [PATCH 32/43] Update config_R300x1x1.py --- 
.../kunlunxin/glm-pytorch/config/config_R300x1x1.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py index 017011698..434dc4afb 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py @@ -2,3 +2,16 @@ train_batch_size = 4 eval_batch_size = 4 + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 +seed = 4096 +max_samples_termination = 5553080 From 73943523fb7bceb7be6559f48a72da954efb28ea Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:39:45 +0800 Subject: [PATCH 33/43] Update config_R300x1x8.py --- .../kunlunxin/glm-pytorch/config/config_R300x1x8.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py index 9f1a38d33..fc38d9c9c 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py @@ -2,3 +2,16 @@ train_batch_size = 5 eval_batch_size = 5 + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 +seed = 4096 +max_samples_termination = 5553080 From 186ae5de861d4d146a4b144c2aaf6c0c87389c62 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:39:56 +0800 Subject: [PATCH 34/43] Update config_R300x2x8.py --- .../kunlunxin/glm-pytorch/config/config_R300x2x8.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py 
b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py index 017011698..434dc4afb 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py @@ -2,3 +2,16 @@ train_batch_size = 4 eval_batch_size = 4 + +lr = 1e-5 +weight_decay = 0.1 +adam_beta1 = 0.9 +adam_beta2 = 0.999 +adam_eps = 1e-08 +gradient_accumulation_steps = 1 +warmup = 0.1 +lr_decay_ratio = 0.1 +lr_decay_iters = 4338 +log_freq = 1 +seed = 4096 +max_samples_termination = 5553080 From 2f204f5fbdaca0d475bd2e9e510e06e5494becba Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:40:15 +0800 Subject: [PATCH 35/43] Update config_common.py --- .../kunlunxin/glm-pytorch/config/config_common.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_common.py b/training/kunlunxin/glm-pytorch/config/config_common.py index b152bcebd..15f7d8476 100644 --- a/training/kunlunxin/glm-pytorch/config/config_common.py +++ b/training/kunlunxin/glm-pytorch/config/config_common.py @@ -3,16 +3,4 @@ dist_backend = "xccl" -lr = 1e-5 -weight_decay = 0.1 -adam_beta1 = 0.9 -adam_beta2 = 0.999 -adam_eps = 1e-08 -gradient_accumulation_steps = 1 -warmup = 0.1 -lr_decay_ratio = 0.1 -lr_decay_iters = 4338 -log_freq = 1 -seed = 4096 -max_samples_termination = 5553080 training_event = None From 26acd75faadec786ad622bd35b55c405559efc2c Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:52:18 +0800 Subject: [PATCH 36/43] Update config_R300x1x1.py --- training/kunlunxin/glm-pytorch/config/config_R300x1x1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py index 434dc4afb..de50ccbba 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py +++ 
b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py @@ -14,4 +14,4 @@ lr_decay_iters = 4338 log_freq = 1 seed = 4096 -max_samples_termination = 5553080 +max_samples_termination = 925510 From 13cc7beffdc17b45b23a2c8ae57fae39622d8fbe Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 13:53:43 +0800 Subject: [PATCH 37/43] Update config_R300x2x8.py --- training/kunlunxin/glm-pytorch/config/config_R300x2x8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py index 434dc4afb..5eb0448a1 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py @@ -14,4 +14,4 @@ lr_decay_iters = 4338 log_freq = 1 seed = 4096 -max_samples_termination = 5553080 +max_samples_termination = 2776540 From d207087bf378a1a0dcde787bcbb7ced444659dcb Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:10:28 +0800 Subject: [PATCH 38/43] Update README.md --- training/kunlunxin/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/README.md b/training/kunlunxin/README.md index e41ab34be..e2a1bb2e9 100644 --- a/training/kunlunxin/README.md +++ b/training/kunlunxin/README.md @@ -27,8 +27,8 @@ R480-X8基于多芯片间高速互联技术,单机可提供高达1 Peta Ops @F - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 - - 训练框架版本: xmlir+111e7d45 【[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/111e7d45/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 - - 训练编译器版本: xacc+111e7d45 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/111e7d45/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)】 + - 训练框架版本: xmlir+111e7d45 【[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 + - 
训练编译器版本: xacc+111e7d45 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)】 - 依赖软件版本:pytorch-1.12.1+cpu ## 容器镜像信息 From 27033bac877c0db2db6c10783d92a07d395f24fe Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:12:41 +0800 Subject: [PATCH 39/43] Update README.md --- training/kunlunxin/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/README.md b/training/kunlunxin/README.md index e2a1bb2e9..07b452931 100644 --- a/training/kunlunxin/README.md +++ b/training/kunlunxin/README.md @@ -27,8 +27,8 @@ R480-X8基于多芯片间高速互联技术,单机可提供高达1 Peta Ops @F - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 - - 训练框架版本: xmlir+111e7d45 【[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 - - 训练编译器版本: xacc+111e7d45 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)】 + - 训练框架版本: xmlir 【[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 + - 训练编译器版本: xacc 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)】 - 依赖软件版本:pytorch-1.12.1+cpu ## 容器镜像信息 From a44db3fbd7480fc515743c1b32c6860bfda47b39 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:16:24 +0800 Subject: [PATCH 40/43] Update README.md --- training/kunlunxin/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/kunlunxin/README.md b/training/kunlunxin/README.md index 07b452931..c8eb6dbc0 100644 --- a/training/kunlunxin/README.md +++ b/training/kunlunxin/README.md @@ -27,8 +27,8 @@ R480-X8基于多芯片间高速互联技术,单机可提供高达1 Peta Ops @F - OS kernel版本: 5.4.0-26-generic - 加速卡驱动版本:4.0.25 - Docker镜像和版本:pytorch1.12.1-cpu-ubuntu20.04:v0.01 
- - 训练框架版本: xmlir 【[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 - - 训练编译器版本: xacc 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)】 + - 训练框架版本: xmlir 【[xmlir下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xmlir-0.0.1-cp38-cp38-linux_x86_64.whl)】 + - 训练编译器版本: xacc 【[xacc下载](https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/latest/xacc-0.1.0-cp38-cp38-linux_x86_64.whl)】 - 依赖软件版本:pytorch-1.12.1+cpu ## 容器镜像信息 From bce5f616028a77d9e5a82fb3fe23e4993801f352 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:19:48 +0800 Subject: [PATCH 41/43] Update config_R300x1x1.py --- training/kunlunxin/glm-pytorch/config/config_R300x1x1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py index de50ccbba..8a4c96915 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x1.py @@ -1,4 +1,4 @@ -from config_common import* +from config_common import * train_batch_size = 4 eval_batch_size = 4 From 42f761d8e596f32f4b770591851925943b1cbfb2 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:20:01 +0800 Subject: [PATCH 42/43] Update config_R300x1x8.py --- training/kunlunxin/glm-pytorch/config/config_R300x1x8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py index fc38d9c9c..7e1db7a25 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x1x8.py @@ -1,4 +1,4 @@ -from config_common import* +from config_common import * train_batch_size = 5
eval_batch_size = 5 From cda8284f0c89c249da0fdf223e1bbe8f0299fbb0 Mon Sep 17 00:00:00 2001 From: GGuanl <143151018+GGuanl@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:20:11 +0800 Subject: [PATCH 43/43] Update config_R300x2x8.py --- training/kunlunxin/glm-pytorch/config/config_R300x2x8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py index 5eb0448a1..840259660 100644 --- a/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py +++ b/training/kunlunxin/glm-pytorch/config/config_R300x2x8.py @@ -1,4 +1,4 @@ -from config_common import* +from config_common import * train_batch_size = 4 eval_batch_size = 4