From 53a8d960264ba200425f14147ce75e73ef84a526 Mon Sep 17 00:00:00 2001 From: xuedinge233 Date: Fri, 30 May 2025 02:52:19 +0000 Subject: [PATCH 1/6] Add workflow for torchtune --- .github/workflows/_ascend_npu_torchtune.yml | 128 ++++++++++++++++++++ .github/workflows/ascend_npu_test.yml | 53 ++++++++ 2 files changed, 181 insertions(+) create mode 100644 .github/workflows/_ascend_npu_torchtune.yml diff --git a/.github/workflows/_ascend_npu_torchtune.yml b/.github/workflows/_ascend_npu_torchtune.yml new file mode 100644 index 0000000..9562cb7 --- /dev/null +++ b/.github/workflows/_ascend_npu_torchtune.yml @@ -0,0 +1,128 @@ +name: "_ascend_npu_torchtune" + +on: + workflow_call: + inputs: + runner: + required: true + type: string + description: "The runner selected to run on" + image: + required: true + type: string + description: "The docker image which will be loaded" + device: + required: true + type: string + description: "The device selected to run on" + torch-artifact: + required: false + type: string + description: "The distribution artifact name of torch" + torch-npu-artifact: + required: true + type: string + description: "The distribution artifact name of torch_npu" + +defaults: + run: + shell: bash -el {0} + +jobs: + torchtune: + name: run torchtune for torch_npu + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.image }} + env: + HF_ENDPOINT: https://hf-mirror.com + + steps: + - name: Show NPU info + run: | + npu-smi info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + - name: Install system dependencies + run: | + apt-get update + apt-get install -y \ + git gcc g++ make cmake ninja-build curl \ + libgl1 libglib2.0-0 libsndfile1 + + # See: https://github.com/actions/checkout/issues/363#issuecomment-1915075699 + # See: https://github.com/hunshcn/gh-proxy/issues/28#issuecomment-773769630 + - name: Config git + run: | + git config --global --add safe.directory "$GITHUB_WORKSPACE" + git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/ + + - name: Checkout + uses: actions/checkout@v4 + + - name: Checkout torchtune + uses: actions/checkout@v4 + with: + repository: pytorch/torchtune + path: torchtune + + - name: Install torchtune + working-directory: torchtune + run: | + pip install -e . + + - name: Download torch artifact + if: ${{ inputs.torch-artifact }} + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.torch-artifact }} + + - name: Install torch + if: ${{ inputs.torch-artifact }} + run: | + pip install ${{ inputs.torch-artifact }} + + - name: Install torch_npu dependencies + if: ${{ !inputs.torch-artifact }} + run: | + pip install -r https://raw.githubusercontent.com/Ascend/pytorch/refs/heads/master/requirements.txt + + - name: List torch version + id: list-torch-version + shell: bash + run: | + torch_version=$(python -c "import torch; print(torch.__version__)") + echo "torch-version=${torch_version}" >> $GITHUB_OUTPUT + + - name: Download torch_npu artifact + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.torch-npu-artifact }} + path: ascend_npu + + - name: Install torch_npu + working-directory: ascend_npu + run: | + pip install ${{ inputs.torch-npu-artifact }} + + - name: Show environment info + run: | + pip list + + - name: Download Qwen2.5 model + run: | + export HF_ENDPOINT=https://hf-mirror.com + huggingface-cli download --resume-download Qwen/Qwen2.5-0.5B-Instruct \ + --local-dir /tmp/Qwen2.5-0.5B-Instruct \ + + - name: Run torchtune with lora finetune + run: | + tune run lora_finetune_single_device --config qwen2_5/0.5B_lora_single_device + + - name: Run torchtune with full finetune + run: | + tune run full_finetune_single_device --config qwen2_5/0.5B_full_single_device diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index e6eda7c..d03f115 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -11,6 +11,7 @@ on: - ".github/workflows/_ascend_npu_ut.yml" - ".github/workflows/_ascend_npu_benchmark.yml" - ".github/workflows/_ascend_npu_torchtitan.yml" + - ".github/workflows/_ascend_npu_torchtune.yml" - ".ci/**" - "ascend_npu/**" - "src/**" @@ -25,6 +26,7 @@ on: - ".github/workflows/_ascend_npu_ut.yml" - ".github/workflows/_ascend_npu_benchmark.yml" - ".github/workflows/_ascend_npu_torchtitan.yml" + - ".github/workflows/_ascend_npu_torchtune.yml" - ".ci/**" - "ascend_npu/**" - "src/**" @@ -120,6 +122,41 @@ jobs: image: ${{ needs.prepare.outputs.image }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + test: + name: Test torch_npu + needs: + - prepare + - build-torch + - build + if: | + !cancelled() && github.event_name != 'repository_dispatch' && + (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) + uses: ./.github/workflows/_ascend_npu_ut.yml + with: + runner: ${{ needs.prepare.outputs.runner }} + image: ${{ needs.prepare.outputs.image }} + device: ${{ needs.prepare.outputs.device }} + torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} + + benchmark: + name: Run benchmarks + needs: + - prepare + - build-torch + - build + if: | + !cancelled() && github.event_name != 'repository_dispatch' && + (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) + uses: ./.github/workflows/_ascend_npu_benchmark.yml + with: + runner: ${{ needs.prepare.outputs.runner }} + image: ${{ needs.prepare.outputs.image }} + torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} + secrets: + pr-token: ${{ secrets.COSDT_BOT_TOKEN }} + torchtitan: name: Run torchtitan needs: @@ -136,3 +173,19 @@ jobs: torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} + torchtune: + name: Run torchtune for torch_npu + needs: + - prepare + - build-torch + - build + if: | + !cancelled() && github.event_name != 'repository_dispatch' && + (success() || (needs.build-torch.result == 'skipped' && needs.build.result == 'success')) + uses: ./.github/workflows/_ascend_npu_torchtune.yml + with: + runner: ${{ needs.prepare.outputs.runner }} + image: ${{ needs.prepare.outputs.image }} + torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} + torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} + From 054edb3e71577bdc0a63ed34a04f11df1d70864e Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Mon, 30 Jun 2025 16:59:04 +0800 Subject: [PATCH 2/6] update --- .github/workflows/ascend_npu_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ascend_npu_test.yml b/.github/workflows/ascend_npu_test.yml index d03f115..a0b66be 100644 --- a/.github/workflows/ascend_npu_test.yml +++ b/.github/workflows/ascend_npu_test.yml @@ -135,7 +135,6 @@ jobs: with: runner: ${{ needs.prepare.outputs.runner }} image: ${{ needs.prepare.outputs.image }} - device: ${{ needs.prepare.outputs.device }} torch-artifact: ${{ needs.build-torch.outputs.torch-artifact }} torch-npu-artifact: ${{ needs.build.outputs.torch-npu-artifact }} From b412499ce232de91c6f7df7142115a8aef96244f Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Thu, 3 Jul 2025 16:14:27 +0800 Subject: [PATCH 3/6] remove device --- .github/workflows/_ascend_npu_torchtitan.yml | 4 ---- .github/workflows/_ascend_npu_torchtune.yml | 8 -------- 2 files changed, 12 deletions(-) diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml index 07f554a..95d743e 100644 --- a/.github/workflows/_ascend_npu_torchtitan.yml +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -11,10 +11,6 @@ on: required: true type: string description: "The docker image which will be loaded" - device: - required: true - type: string - description: "The device selected to run on" torch-artifact: required: false type: string diff --git a/.github/workflows/_ascend_npu_torchtune.yml b/.github/workflows/_ascend_npu_torchtune.yml index 9562cb7..825c7bd 100644 --- a/.github/workflows/_ascend_npu_torchtune.yml +++ b/.github/workflows/_ascend_npu_torchtune.yml @@ -11,10 +11,6 @@ on: required: true type: string description: "The docker image which will be loaded" - device: - required: true - type: string - description: "The device selected to run on" torch-artifact: required: false type: string @@ -24,10 +20,6 @@ on: type: string description: "The distribution artifact name of torch_npu" -defaults: - run: - shell: bash -el {0} - jobs: torchtune: name: run torchtune for torch_npu From c3d7970b6e0e9c7cbfeab6236a9ef1c467654cc8 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Thu, 3 Jul 2025 16:24:12 +0800 Subject: [PATCH 4/6] update --- .github/workflows/_ascend_npu_torchtitan.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/_ascend_npu_torchtitan.yml b/.github/workflows/_ascend_npu_torchtitan.yml index 95d743e..fe27ed9 100644 --- a/.github/workflows/_ascend_npu_torchtitan.yml +++ b/.github/workflows/_ascend_npu_torchtitan.yml @@ -19,10 +19,6 @@ on: required: true type: string description: "The distribution artifact name of torch_npu" - secrets: - pr-token: - description: "A token used to create a pull request" - required: true # Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly # declared as "shell: bash -el {0}" on steps that need to be properly activated. From 723c8e3fdd676c274df20ceed31c8c8019a9a034 Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Mon, 7 Jul 2025 10:38:30 +0800 Subject: [PATCH 5/6] update --- .github/workflows/_ascend_npu_torchtune.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_ascend_npu_torchtune.yml b/.github/workflows/_ascend_npu_torchtune.yml index 825c7bd..7929e00 100644 --- a/.github/workflows/_ascend_npu_torchtune.yml +++ b/.github/workflows/_ascend_npu_torchtune.yml @@ -67,6 +67,12 @@ jobs: run: | pip install -e . + - name: Download Qwen2.5 model + run: | + export HF_ENDPOINT=https://hf-mirror.com + tune download Qwen/Qwen2.5-0.5B-Instruct \ + --output-dir /tmp/Qwen2.5-0.5B-Instruct + - name: Download torch artifact if: ${{ inputs.torch-artifact }} uses: actions/download-artifact@v4 @@ -103,13 +109,7 @@ jobs: - name: Show environment info run: | - pip list - - - name: Download Qwen2.5 model - run: | - export HF_ENDPOINT=https://hf-mirror.com - huggingface-cli download --resume-download Qwen/Qwen2.5-0.5B-Instruct \ - --local-dir /tmp/Qwen2.5-0.5B-Instruct \ + pip list - name: Run torchtune with lora finetune run: | From 1fbdc82f1e38b62cff1f2dbc527d49958a3eb26a Mon Sep 17 00:00:00 2001 From: Jiahao Su Date: Mon, 7 Jul 2025 10:45:47 +0800 Subject: [PATCH 6/6] update --- .github/workflows/_ascend_npu_torchtune.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/_ascend_npu_torchtune.yml b/.github/workflows/_ascend_npu_torchtune.yml index 7929e00..2d78d46 100644 --- a/.github/workflows/_ascend_npu_torchtune.yml +++ b/.github/workflows/_ascend_npu_torchtune.yml @@ -67,12 +67,6 @@ jobs: run: | pip install -e . - - name: Download Qwen2.5 model - run: | - export HF_ENDPOINT=https://hf-mirror.com - tune download Qwen/Qwen2.5-0.5B-Instruct \ - --output-dir /tmp/Qwen2.5-0.5B-Instruct - - name: Download torch artifact if: ${{ inputs.torch-artifact }} uses: actions/download-artifact@v4 @@ -109,7 +103,13 @@ jobs: - name: Show environment info run: | - pip list + pip list + + - name: Download Qwen2.5 model + run: | + export HF_ENDPOINT=https://hf-mirror.com + tune download Qwen/Qwen2.5-0.5B-Instruct \ + --output-dir /tmp/Qwen2.5-0.5B-Instruct - name: Run torchtune with lora finetune run: |