diff --git a/.ci/benchmark.py b/.ci/benchmark.py
index f6c9400..1b26b68 100644
--- a/.ci/benchmark.py
+++ b/.ci/benchmark.py
@@ -1,6 +1,5 @@
 import argparse
 import os
-import sys
 from src.benchmark.utils import read_metrics, to_markdown_table
 
 
diff --git a/.github/workflows/_ascend_npu_benchmark.yml b/.github/workflows/_ascend_npu_benchmark.yml
index 3217f1f..2f99106 100644
--- a/.github/workflows/_ascend_npu_benchmark.yml
+++ b/.github/workflows/_ascend_npu_benchmark.yml
@@ -83,7 +83,10 @@ jobs:
       - name: Checkout benchmark
         uses: actions/checkout@v4
         with:
-          repository: pytorch/benchmark
+          # TODO(shink): Switch back to pytorch/benchmark once this PR is merged: https://github.com/pytorch/benchmark/pull/2592
+          # repository: pytorch/benchmark
+          repository: shink/benchmark
+          ref: feat/test_bench/config
           path: benchmark
 
       # TODO
@@ -127,6 +130,11 @@ jobs:
         run: |
           pip install ${{ inputs.torch-npu-artifact }}
 
+      - name: Install torchvision and torchaudio
+        run: |
+          # pip install --pre torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/nightly/cpu
+          pip install --pre torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/test/cpu
+
       - name: Install benchmark dependencies
         run: |
           pip install -r benchmark/requirements.txt \
@@ -137,26 +145,19 @@ jobs:
         run: |
           python benchmark/install.py --userbenchmark test_bench --continue_on_fail
 
-      - name: Install nightly torchvision and torchaudio
-        run: |
-          pip install --pre torchvision torchaudio --no-deps --index-url https://download.pytorch.org/whl/nightly/cpu
-
       - name: Install project dependencies
         run: |
           pip install -r requirements.txt
 
-      - name: Show environment info
+      - name: List installed python packages
+        if: ${{ always() }}
         run: |
-          npu_is_available=$(python -c "import torch; print(torch.npu.is_available())")
-          npu_count=$(python -c "import torch; print(torch.npu.device_count())")
-          echo "NPU is available: ${npu_is_available}"
-          echo "NPU count: ${npu_count}"
           pip list | grep -E 'torch|numpy'
 
       - name: Run benchmarks
-        working-directory: benchmark
         run: |
-          python run_benchmark.py test_bench --accuracy --device npu --test eval \
+          python benchmark/run_benchmark.py test_bench \
+              --config ascend_npu/torchbenchmark-config.yml \
               --output ascend_npu_benchmark.json
 
       - name: Upload the benchmark report file
@@ -166,7 +167,7 @@ jobs:
           name: ascend_npu_benchmark.json
           path: benchmark/ascend_npu_benchmark.json
           if-no-files-found: error
-          retention-days: 1
+          retention-days: 3
           overwrite: true
 
       - name: Write to workflow job summary
diff --git a/.github/workflows/_ascend_npu_build_torch_npu.yml b/.github/workflows/_ascend_npu_build_torch_npu.yml
index 00b1815..b6ded9c 100644
--- a/.github/workflows/_ascend_npu_build_torch_npu.yml
+++ b/.github/workflows/_ascend_npu_build_torch_npu.yml
@@ -73,11 +73,7 @@ jobs:
       - name: Checkout torch_npu
         uses: actions/checkout@v4
         with:
-          # TODO(shink): Use Ascend/pytorch once this pr merged:
-          # https://gitee.com/ascend/pytorch/pulls/12854
-          # repository: Ascend/pytorch
-          repository: shink/torchnpu
-          ref: feat/autoload
+          repository: Ascend/pytorch
           submodules: recursive
           path: torch_npu
 
diff --git a/.github/workflows/_ascend_npu_ut.yml b/.github/workflows/_ascend_npu_ut.yml
index f5ccc85..bfb0035 100644
--- a/.github/workflows/_ascend_npu_ut.yml
+++ b/.github/workflows/_ascend_npu_ut.yml
@@ -75,11 +75,7 @@ jobs:
       - name: Checkout torch_npu
         uses: actions/checkout@v4
         with:
-          # TODO(shink): Use Ascend/pytorch once this pr merged:
-          # https://gitee.com/ascend/pytorch/pulls/12854
-          # repository: Ascend/pytorch
-          repository: shink/torchnpu
-          ref: feat/autoload
+          repository: Ascend/pytorch
           path: torch_npu
 
       - name: Install pip dependencies
diff --git a/.github/workflows/dispatch-event.yml b/.github/workflows/dispatch-event.yml
index 3722746..4ed367f 100644
--- a/.github/workflows/dispatch-event.yml
+++ b/.github/workflows/dispatch-event.yml
@@ -31,7 +31,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
-      # List PRs created in the past 24 hours
+      # List PRs created in the past 24 hours and labeled 'ciflow/out-of-tree' (both are defaults that workflow inputs can override)
       - name: List PyTorch PRs
         id: list-pr
         uses: ./.github/actions/list-pr
@@ -39,7 +39,7 @@ jobs:
           token: ${{ secrets.COSDT_BOT_TOKEN }}
           owner: pytorch
           repository: pytorch
-          labels: ${{ github.event.inputs.labels || '' }}
+          labels: ${{ github.event.inputs.labels || 'ciflow/out-of-tree' }}
           hours: ${{ github.event.inputs.hours || '24' }}
 
   dispatch-pr:
@@ -49,7 +49,7 @@ jobs:
       - list-pr
     strategy:
       fail-fast: false
-      max-parallel: 1
+      max-parallel: 1 # TODO: The NPU CI runner currently supports running only 1 job at a time
       matrix:
         data: ${{ fromJSON(needs.list-pr.outputs.prs) }}
     steps:
diff --git a/ascend_npu/matadata.yml b/ascend_npu/matadata.yml
deleted file mode 100644
index c5e2cca..0000000
--- a/ascend_npu/matadata.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-device: "npu"
-backend_extension: "torch_npu"
-link: https://github.com/Ascend/pytorch
-torchbenchmark:
-  test:
-    - train
-    - eval
-  models:
-  skip:
-    - llava
diff --git a/ascend_npu/torchbenchmark-config.yml b/ascend_npu/torchbenchmark-config.yml
new file mode 100644
index 0000000..84cba1e
--- /dev/null
+++ b/ascend_npu/torchbenchmark-config.yml
@@ -0,0 +1,10 @@
+devices:
+  - "npu"
+backend_extension: "torch_npu"
+link: https://github.com/Ascend/pytorch
+models:
+  - model: llava
+    skip: true # Out of memory
+batch_size: 1
+extra_args:
+  - "--accuracy"
diff --git a/src/benchmark/utils.py b/src/benchmark/utils.py
index 983823f..096c70d 100644
--- a/src/benchmark/utils.py
+++ b/src/benchmark/utils.py
@@ -16,6 +16,7 @@ class TorchBenchModelConfig:
     extra_args: List[str]
     extra_env: Optional[Dict[str, str]] = None
     output_dir: Optional[pathlib.Path] = None
+    skip: bool = False
 
 
 @dataclasses.dataclass
@@ -68,6 +69,7 @@ def read_metrics(path: str, *, metric=None) -> List[TorchBenchModelMetric]:
                 extra_args=key_dict.get("extra_args"),
                 extra_env=key_dict.get("extra_env"),
                 output_dir=key_dict.get("output_dir"),
+                skip=key_dict.get("skip"),
             )
             model_metric = TorchBenchModelMetric(config, metric_value)
             metrics.append(model_metric)
@@ -79,7 +81,7 @@ def generate_table_rows(metrics: List[TorchBenchModelMetric]):
     models = list({metric.key.name for metric in metrics})
     models = sorted(models, key=lambda x: x.lower())
 
-    def filter_metric(metrics: List[TorchBenchModelMetric], *, model, device):
+    def _filter_metric(metrics: List[TorchBenchModelMetric], *, model, device):
         for metric in metrics:
             if metric.key.name == model and metric.key.device == device:
                 return metric
@@ -88,7 +90,7 @@ def filter_metric(metrics: List[TorchBenchModelMetric], *, model, device):
     for model in models:
         row = [model]
         for device in devices:
-            metric = filter_metric(metrics, model=model, device=device)
+            metric = _filter_metric(metrics, model=model, device=device)
             if metric is not None:
                 if metric.value == "pass":
                     cell = "✅"