diff --git a/.ci/benchmark.py b/.ci/benchmark.py index f6c9400..1b26b68 100644 --- a/.ci/benchmark.py +++ b/.ci/benchmark.py @@ -1,6 +1,5 @@ import argparse import os -import sys from src.benchmark.utils import read_metrics, to_markdown_table diff --git a/.github/workflows/_ascend_npu_benchmark.yml b/.github/workflows/_ascend_npu_benchmark.yml index 3217f1f..2f99106 100644 --- a/.github/workflows/_ascend_npu_benchmark.yml +++ b/.github/workflows/_ascend_npu_benchmark.yml @@ -83,7 +83,10 @@ jobs: - name: Checkout benchmark uses: actions/checkout@v4 with: - repository: pytorch/benchmark + # TODO(shink): https://github.com/pytorch/benchmark/pull/2592 + # repository: pytorch/benchmark + repository: shink/benchmark + ref: feat/test_bench/config path: benchmark # TODO @@ -127,6 +130,11 @@ jobs: run: | pip install ${{ inputs.torch-npu-artifact }} + - name: Install torchvision and torchaudio + run: | + # pip install --pre torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/nightly/cpu + pip install --pre torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/test/cpu + - name: Install benchmark dependencies run: | pip install -r benchmark/requirements.txt \ @@ -137,26 +145,19 @@ jobs: run: | python benchmark/install.py --userbenchmark test_bench --continue_on_fail - - name: Install nightly torchvision and torchaudio - run: | - pip install --pre torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/nightly/cpu - - name: Install project dependencies run: | pip install -r requirements.txt - - name: Show environment info + - name: List installed python packages + if: ${{ always() }} run: | - npu_is_available=$(python -c "import torch; print(torch.npu.is_available())") - npu_count=$(python -c "import torch; print(torch.npu.device_count())") - echo "NPU is available: ${npu_is_available}" - echo "NPU count: ${npu_count}" pip list | grep -E 'torch|numpy' - name: Run benchmarks - working-directory: benchmark 
run: | - python run_benchmark.py test_bench --accuracy --device npu --test eval \ + python benchmark/run_benchmark.py test_bench \ + --config ascend_npu/torchbenchmark-config.yml \ --output ascend_npu_benchmark.json - name: Upload the benchmark report file @@ -166,7 +167,7 @@ jobs: name: ascend_npu_benchmark.json path: benchmark/ascend_npu_benchmark.json if-no-files-found: error - retention-days: 1 + retention-days: 3 overwrite: true - name: Write to workflow job summary diff --git a/.github/workflows/_ascend_npu_build_torch_npu.yml b/.github/workflows/_ascend_npu_build_torch_npu.yml index 00b1815..b6ded9c 100644 --- a/.github/workflows/_ascend_npu_build_torch_npu.yml +++ b/.github/workflows/_ascend_npu_build_torch_npu.yml @@ -73,11 +73,7 @@ jobs: - name: Checkout torch_npu uses: actions/checkout@v4 with: - # TODO(shink): Use Ascend/pytorch once this pr merged: - # https://gitee.com/ascend/pytorch/pulls/12854 - # repository: Ascend/pytorch - repository: shink/torchnpu - ref: feat/autoload + repository: Ascend/pytorch submodules: recursive path: torch_npu diff --git a/.github/workflows/_ascend_npu_ut.yml b/.github/workflows/_ascend_npu_ut.yml index f5ccc85..bfb0035 100644 --- a/.github/workflows/_ascend_npu_ut.yml +++ b/.github/workflows/_ascend_npu_ut.yml @@ -75,11 +75,7 @@ jobs: - name: Checkout torch_npu uses: actions/checkout@v4 with: - # TODO(shink): Use Ascend/pytorch once this pr merged: - # https://gitee.com/ascend/pytorch/pulls/12854 - # repository: Ascend/pytorch - repository: shink/torchnpu - ref: feat/autoload + repository: Ascend/pytorch path: torch_npu - name: Install pip dependencies diff --git a/.github/workflows/dispatch-event.yml b/.github/workflows/dispatch-event.yml index 3722746..4ed367f 100644 --- a/.github/workflows/dispatch-event.yml +++ b/.github/workflows/dispatch-event.yml @@ -31,7 +31,7 @@ jobs: - name: Checkout uses: actions/checkout@v4 - # List PRs created in the past 24 hours + # List PRs created in the past 24 hours and labeled 
'ciflow/out-of-tree' - name: List PyTorch PRs id: list-pr uses: ./.github/actions/list-pr @@ -39,7 +39,7 @@ jobs: token: ${{ secrets.COSDT_BOT_TOKEN }} owner: pytorch repository: pytorch - labels: ${{ github.event.inputs.labels || '' }} + labels: ${{ github.event.inputs.labels || 'ciflow/out-of-tree' }} hours: ${{ github.event.inputs.hours || '24' }} dispatch-pr: @@ -49,7 +49,7 @@ jobs: - list-pr strategy: fail-fast: false - max-parallel: 1 + max-parallel: 1 # TODO: We now only support running 1 job at the same time on NPU CI runner matrix: data: ${{ fromJSON(needs.list-pr.outputs.prs) }} steps: diff --git a/ascend_npu/matadata.yml b/ascend_npu/matadata.yml deleted file mode 100644 index c5e2cca..0000000 --- a/ascend_npu/matadata.yml +++ /dev/null @@ -1,10 +0,0 @@ -device: "npu" -backend_extension: "torch_npu" -link: https://github.com/Ascend/pytorch -torchbenchmark: - test: - - train - - eval - models: - skip: - - llava diff --git a/ascend_npu/torchbenchmark-config.yml b/ascend_npu/torchbenchmark-config.yml new file mode 100644 index 0000000..84cba1e --- /dev/null +++ b/ascend_npu/torchbenchmark-config.yml @@ -0,0 +1,10 @@ +devices: + - "npu" +backend_extension: "torch_npu" +link: https://github.com/Ascend/pytorch +models: + - model: llava + skip: true # Out of memory +batch_size: 1 +extra_args: + - "--accuracy" diff --git a/src/benchmark/utils.py b/src/benchmark/utils.py index 983823f..096c70d 100644 --- a/src/benchmark/utils.py +++ b/src/benchmark/utils.py @@ -16,6 +16,7 @@ class TorchBenchModelConfig: extra_args: List[str] extra_env: Optional[Dict[str, str]] = None output_dir: Optional[pathlib.Path] = None + skip: bool = False @dataclasses.dataclass @@ -68,6 +69,7 @@ def read_metrics(path: str, *, metric=None) -> List[TorchBenchModelMetric]: extra_args=key_dict.get("extra_args"), extra_env=key_dict.get("extra_env"), output_dir=key_dict.get("output_dir"), + skip=key_dict.get("skip"), ) model_metric = TorchBenchModelMetric(config, metric_value) 
metrics.append(model_metric) @@ -79,7 +81,7 @@ def generate_table_rows(metrics: List[TorchBenchModelMetric]): models = list({metric.key.name for metric in metrics}) models = sorted(models, key=lambda x: x.lower()) - def filter_metric(metrics: List[TorchBenchModelMetric], *, model, device): + def _filter_metric(metrics: List[TorchBenchModelMetric], *, model, device): for metric in metrics: if metric.key.name == model and metric.key.device == device: return metric @@ -88,7 +90,7 @@ def filter_metric(metrics: List[TorchBenchModelMetric], *, model, device): for model in models: row = [model] for device in devices: - metric = filter_metric(metrics, model=model, device=device) + metric = _filter_metric(metrics, model=model, device=device) if metric is not None: if metric.value == "pass": cell = "✅"